Revert "[v1] Support multiple KV cache groups in GPU model runner (#17945)" (#18459)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
Mark McLoughlin
2025-05-21 18:25:23 +01:00
committed by GitHub
parent dd5fa7e04f
commit bb0a311213
15 changed files with 214 additions and 481 deletions

View File

@@ -67,13 +67,13 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
max_model_len = self.runner.model_config.max_model_len
assert max_model_len == 32768,\
"AITER MLA requires max_model_len=32768"
assert self.kv_cache_spec.block_size == 1, "AITER MLA" \
assert self.runner.block_size == 1, "AITER MLA" \
"only supports block size 1."
def _get_paged_kv_tensors(
self, block_table: torch.Tensor,
seq_lens: torch.Tensor) -> tuple[torch.Tensor, ...]:
page_size = self.kv_cache_spec.block_size
page_size = self.runner.block_size
block_table_bounds = (seq_lens + page_size - 1) // page_size
mask = (torch.arange(block_table.size(1),