[Attention] Refactor CUDA attention backend selection logic (#24794)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
Matthew Bonanni
2025-11-11 06:40:44 -06:00
committed by GitHub
parent 2e78150d24
commit b30dfa03c5
61 changed files with 1338 additions and 1002 deletions

View File

@@ -93,6 +93,17 @@ def can_initialize(
"pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
)
if model_arch == "DeepseekV32ForCausalLM":
from vllm.platforms import current_platform
capability = current_platform.get_device_capability()
if capability and capability.major < 9:
pytest.skip(
f"DeepseekV32 requires Hopper (9.0+) or Blackwell (10.0+) "
f"for FLASHMLA_SPARSE backend. Current device has compute "
f"capability {capability.major}.{capability.minor}"
)
with (
patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
monkeypatch.context() as m,