[V1][TPU] Apply the ragged paged attention kernel fix and remove the padding. (#14846)

Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-03-17 01:48:28 -07:00
parent 69698f257e
commit b4ad56c1bd
2 changed files with 8 additions and 11 deletions
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -23,8 +23,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
-from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
-                                               PallasAttentionBackend,
+from vllm.v1.attention.backends.pallas import (PallasAttentionBackend,
                                               PallasMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -139,10 +138,8 @@ class TPUModelRunner:
                                            device="cpu")
        self.slot_mapping_np = self.slot_mapping_cpu.numpy()

-        padded_max_num_blocks_per_req = _get_padded_number(
-            self.max_num_blocks_per_req, NUM_KV_PAGES_PER_BLOCK)
        self.block_table_cpu = torch.zeros(
-            (self.max_num_tokens, padded_max_num_blocks_per_req),
+            (self.max_num_tokens, self.max_num_blocks_per_req),
            dtype=self.input_batch.block_table.get_cpu_tensor().dtype,
            device="cpu")