[Attention] Refactor CUDA attention backend selection logic (#24794)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-11-11 06:40:44 -06:00
parent 2e78150d24
commit b30dfa03c5
61 changed files with 1338 additions and 1002 deletions
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -93,6 +93,17 @@ def can_initialize(
            "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
        )

+    if model_arch == "DeepseekV32ForCausalLM":
+        from vllm.platforms import current_platform
+
+        capability = current_platform.get_device_capability()
+        if capability and capability.major < 9:
+            pytest.skip(
+                f"DeepseekV32 requires Hopper (9.0+) or Blackwell (10.0+) "
+                f"for FLASHMLA_SPARSE backend. Current device has compute "
+                f"capability {capability.major}.{capability.minor}"
+            )
+
    with (
        patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
        monkeypatch.context() as m,