fix: Add glm4_moe_lite to MLA detection (#32614)

Signed-off-by: marksverdhei <marksverdhei@hotmail.com>
Signed-off-by: Markus / Mark <46672778+marksverdhei@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Markus / Mark
2026-01-23 21:38:57 +01:00
committed by GitHub
parent 3a41459501
commit 586a57ad7e
4 changed files with 70 additions and 10 deletions

View File

@@ -63,6 +63,32 @@ class FlashInferMLABackend(MLACommonBackend):
def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
    """Report whether this backend can run on the given device capability.

    Only SM 10.x (Blackwell-class) devices are accepted; every other
    compute-capability major version is rejected.
    """
    major_version = capability.major
    return major_version == 10
@classmethod
def supports_combination(
    cls,
    head_size: int,
    dtype: torch.dtype,
    kv_cache_dtype: CacheDType | None,
    block_size: int,
    use_mla: bool,
    has_sink: bool,
    use_sparse: bool,
    device_capability: DeviceCapability,
) -> str | None:
    """Return ``None`` when this combination is supported, else a reason string.

    The FlashInfer MLA kernel only handles models whose
    ``qk_nope_head_dim`` is exactly 128. Models whose text config lacks
    the attribute fall back to a default of 1 and are therefore
    rejected as well (fail-closed).
    """
    # Imported lazily to avoid a module-level import cycle with vllm.config.
    from vllm.config import get_current_vllm_config

    model_config = get_current_vllm_config().model_config
    if model_config is None:
        # No model config available — nothing to validate against.
        return None

    nope_dim = getattr(model_config.hf_text_config, "qk_nope_head_dim", 1)
    if nope_dim == 128:
        return None
    return (
        f"FlashInfer MLA kernel requires qk_nope_head_dim == 128, "
        f"but got {nope_dim}"
    )
@classmethod
def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
    """Return the KV-cache memory layout this backend requires.

    Always ``"HND"`` — the FlashInfer MLA path does not support any
    alternative layout.
    """
    required_layout = "HND"
    return required_layout