Bump Flashinfer Version and Re-enable DeepSeek NVFP4 AR+Norm Fusion (#34899)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
Wei Zhao
2026-02-20 16:37:31 -05:00
committed by GitHub
parent 0632ed8778
commit ea5f903f80
5 changed files with 6 additions and 29 deletions

View File

@@ -536,34 +536,12 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
)
class DeepseekV3ForCausalLM(VerifyAndUpdateConfig):
    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """Disable AR-RMS-Quant fusion for DeepSeekV3 when NVFP4-quantized.

        The allreduce + rmsnorm + fp4-quant fusion pass is known to be broken
        for DeepSeekV3 with NVFP4 quantization
        (https://github.com/vllm-project/vllm/issues/34395), so it is turned
        off by default for this model/quant combination.  If the user has
        explicitly enabled it, the setting is respected but a warning is
        emitted.
        """
        # TODO: https://github.com/vllm-project/vllm/issues/34395
        # disable AR-rms-fp4 fusion for DSv3+
        ar_rms_enabled = vllm_config.compilation_config.pass_config.fuse_allreduce_rms
        nvfp4 = vllm_config.model_config.is_nvfp4_quantized()
        if not nvfp4:
            return
        if ar_rms_enabled is None:
            # Unset by the user: default to disabled for NVFP4 DeepSeekV3.
            vllm_config.compilation_config.pass_config.fuse_allreduce_rms = False
        elif ar_rms_enabled:
            # Explicitly enabled by the user: keep it, but warn.
            # Fix: the original concatenated string literals rendered as
            # "NVFP4 quant,see https://..." — a space was missing.
            logger.warning(
                "Allreduce-rms fusion broken for DeepSeekV3 with NVFP4 quant, "
                "see https://github.com/vllm-project/vllm/issues/34395."
            )
class DeepseekV32ForCausalLM(DeepseekV3ForCausalLM):
class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
"""
Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
"""
super().verify_and_update_config(vllm_config)
hf_config = vllm_config.model_config.hf_config
# Mirror the check in vllm/model_executor/models/deepseek_v2.py
@@ -654,7 +632,6 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"MambaForCausalLM": MambaModelConfig,
"Mamba2ForCausalLM": MambaModelConfig,
"FalconMambaForCausalLM": MambaModelConfig,
"DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
"DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
"NemotronHForCausalLM": NemotronHForCausalLMConfig,
"NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig,