[HMA]Fix corner case when hybrid page_size can not be evenly divided issue (blk_size=64,tp=4) (#37467)

Signed-off-by: Chendi Xue <chendi.xue@intel.com> Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Signed-off-by: Chendi.Xue <chendi.xue@intel.com> Co-authored-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-03-30 11:47:30 -05:00
parent b4a2f3ac36
commit 3b1dbaad4e
12 changed files with 220 additions and 186 deletions
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -851,6 +851,7 @@ def test_hybrid_attention_mamba_tensor_shapes():
        vllm_ctx = vllm_config.compilation_config.static_forward_context

        runner = GPUModelRunner(vllm_config, DEVICE)
+        current_platform.update_block_size_for_backend(vllm_config)
        kv_cache_spec = runner.get_kv_cache_spec()

        available_memory = 5 * GiB_bytes
@@ -1306,6 +1307,7 @@ def test_mamba_cache_raises_when_max_num_seqs_exceeds_blocks():
        assert fwd_context is not None

        runner = GPUModelRunner(vllm_config, DEVICE)
+        current_platform.update_block_size_for_backend(vllm_config)
        kv_cache_spec = runner.get_kv_cache_spec()

        available_memory = 5 * GiB_bytes