[V1] Support any head size for FlexAttention backend (#20467)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-07 00:54:36 +08:00
parent e202dd2736
commit 9fb52e523a
20 changed files with 202 additions and 118 deletions
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -33,9 +33,6 @@ if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

 REQUIRES_V0_MODELS = [
-    # V1 Test: no way to fall back for head_dim = 80
-    # https://github.com/vllm-project/vllm/issues/14524
-    "qwen_vl",
    # V1 Test: not enough KV cache space in C1.
    "fuyu",
 ]
@@ -221,8 +218,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "blip2": VLMTestInfo(
-        # TODO: Change back to 2.7b once head_dim = 80 is supported
-        models=["Salesforce/blip2-opt-6.7b"],
+        models=["Salesforce/blip2-opt-2.7b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
@@ -340,8 +336,7 @@ VLM_TEST_SETTINGS = {
    "h2ovl": VLMTestInfo(
        models = [
            "h2oai/h2ovl-mississippi-800m",
-            # TODO: Re-enable once head_dim = 80 is supported
-            # "h2oai/h2ovl-mississippi-2b",
+            "h2oai/h2ovl-mississippi-2b",
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501