[torch.compile] Disable ar-rms fusion for ds3-fp4 & DP, fix CI test (#34392)

Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Luka Govedič
2026-02-15 09:33:57 -05:00
committed by GitHub
parent f07a128413
commit 23d825aba1
3 changed files with 46 additions and 3 deletions

View File

@@ -536,12 +536,34 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
)
class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
class DeepseekV3ForCausalLM(VerifyAndUpdateConfig):
    """Config verification hook for DeepSeekV3 checkpoints."""

    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """Disable AR-RMS-Quant fusion for DeepSeekV3 in NVFP4.

        When the model is NVFP4-quantized and ``fuse_allreduce_rms`` is
        still ``None``, the flag is set to ``False``. When the flag is
        truthy, it is left untouched and a warning is logged instead.

        Args:
            vllm_config: Config object whose
                ``compilation_config.pass_config.fuse_allreduce_rms`` flag
                is read and possibly updated in place.
        """
        # TODO: https://github.com/vllm-project/vllm/issues/34395
        # disable AR-rms-fp4 fusion for DSv3+
        pass_config = vllm_config.compilation_config.pass_config
        ar_rms_enabled = pass_config.fuse_allreduce_rms
        nvfp4 = vllm_config.model_config.is_nvfp4_quantized()

        # Disable by default, warn if manually enabled:
        if ar_rms_enabled is None and nvfp4:
            pass_config.fuse_allreduce_rms = False
        if ar_rms_enabled and nvfp4:
            # Adjacent string literals are concatenated verbatim, so the
            # separating space has to be part of the first literal.
            logger.warning(
                "Allreduce-rms fusion broken for DeepSeekV3 with NVFP4 quant, "
                "see https://github.com/vllm-project/vllm/issues/34395."
            )
class DeepseekV32ForCausalLM(DeepseekV3ForCausalLM):
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
"""
Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
"""
super().verify_and_update_config(vllm_config)
hf_config = vllm_config.model_config.hf_config
# Mirror the check in vllm/model_executor/models/deepseek_v2.py
@@ -632,6 +654,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"MambaForCausalLM": MambaModelConfig,
"Mamba2ForCausalLM": MambaModelConfig,
"FalconMambaForCausalLM": MambaModelConfig,
"DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
"DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
"NemotronHForCausalLM": NemotronHForCausalLMConfig,
"NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig,