[UX] Only show FP4 Marlin fallback warning for w4a4 models (#36806)

Co-authored-by: Claude <noreply@anthropic.com>
2026-03-12 10:19:35 +01:00
parent 3e64fe4a18
commit 57431d8231
3 changed files with 12 additions and 14 deletions
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -324,6 +324,12 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
        )
        delattr(layer, "w2_weight_packed")

+        logger.warning_once(
+            "Your GPU does not have native support for FP4 computation but "
+            "FP4 quantization is being used. Weight-only FP4 compression "
+            "will be used leveraging the Marlin kernel. This may degrade "
+            "performance for compute-heavy workloads."
+        )
        prepare_moe_fp4_layer_for_marlin(layer)

        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -147,13 +147,6 @@ def apply_fp4_marlin_linear(
 def prepare_fp4_layer_for_marlin(
    layer: torch.nn.Module, input_dtype: torch.dtype | None = None
 ) -> None:
-    logger.warning_once(
-        "Your GPU does not have native support for FP4 computation but "
-        "FP4 quantization is being used. Weight-only FP4 compression will "
-        "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads."
-    )
-
    is_nvfp4 = hasattr(layer, "weight_global_scale")
    if input_dtype is not None and input_dtype.itemsize == 1:
        if is_nvfp4:
@@ -335,13 +328,6 @@ def prepare_nvfp4_moe_layer_for_marlin(
 def prepare_moe_fp4_layer_for_marlin(
    layer: torch.nn.Module, input_dtype: torch.dtype | None = None
 ) -> None:
-    logger.warning_once(
-        "Your GPU does not have native support for FP4 computation but "
-        "FP4 quantization is being used. Weight-only FP4 compression will "
-        "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads."
-    )
-
    is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
    if input_dtype is not None and input_dtype.itemsize == 1:
        if is_nvfp4:
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
@@ -141,6 +141,12 @@ def convert_to_nvfp4_linear_kernel_format(
    layer.weights_padding_cols = 0

    if backend == NvFp4LinearBackend.MARLIN:
+        logger.warning_once(
+            "Your GPU does not have native support for FP4 computation but "
+            "FP4 quantization is being used. Weight-only FP4 compression "
+            "will be used leveraging the Marlin kernel. This may degrade "
+            "performance for compute-heavy workloads."
+        )
        prepare_fp4_layer_for_marlin(layer)
    elif backend == NvFp4LinearBackend.FLASHINFER_TRTLLM:
        weight, weight_scale = prepare_weights_for_nvfp4_flashinfer_trtllm(