[torch.compile] Disable ar-rms fusion for ds3-fp4 & DP, fix CI test (#34392)

Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-15 09:33:57 -05:00
parent f07a128413
commit 23d825aba1
3 changed files with 46 additions and 3 deletions
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1687,6 +1687,20 @@ class ModelConfig:
    def is_quantized(self) -> bool:
        return getattr(self.hf_config, "quantization_config", None) is not None

+    def is_nvfp4_quantized(self) -> bool:
+        # ModelOpt NVFP4 checkpoints resolve to the `modelopt_fp4` quantization method
+        if self.quantization in ("modelopt_fp4",):
+            return True
+
+        # For Compressed Tensors, match any `format` containing "nvfp4"
+        # (e.g. `"format": "nvfp4-pack-quantized"`) in the quantization config
+        quant_config = self.model_arch_config.quantization_config
+        return (
+            self.quantization == "compressed-tensors"
+            and quant_config is not None
+            and "nvfp4" in quant_config.get("format", "").lower()
+        )
+

 def get_served_model_name(model: str, served_model_name: str | list[str] | None):
    """