[v1][attention] Support Hybrid Allocator + FlashInfer (#21412)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
@@ -745,7 +745,8 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     layer_4 = "model.layers.4.mixer"
     layer_5 = "model.layers.5.mixer"

-    with set_current_vllm_config(vllm_config):
+    with set_current_vllm_config(vllm_config), monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
         hf_config = vllm_config.model_config.hf_config
         fwd_context = {}
         for key in [layer_0, layer_1]:
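For context, the hunk above uses pytest's monkeypatch.context() so the VLLM_ATTENTION_BACKEND override only applies inside the with block. The standalone sketch below (not part of the commit; the test name and assertions are illustrative only) shows that scoping pattern in isolation:

    import os


    def test_backend_env_is_scoped(monkeypatch):
        # Hypothetical standalone test, not from the commit: start from a clean env.
        monkeypatch.delenv("VLLM_ATTENTION_BACKEND", raising=False)

        with monkeypatch.context() as m:
            # The override is visible only while the context is active.
            m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
            assert os.environ["VLLM_ATTENTION_BACKEND"] == "FLASHINFER"

        # Exiting the context undoes the setenv, so later code sees no override.
        assert "VLLM_ATTENTION_BACKEND" not in os.environ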