[UX] Only show FP4 Marlin fallback warning for w4a4 models (#36806)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Michael Goin
2026-03-12 10:19:35 +01:00
committed by GitHub
parent 3e64fe4a18
commit 57431d8231
3 changed files with 12 additions and 14 deletions

View File

@@ -324,6 +324,12 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
)
delattr(layer, "w2_weight_packed")
logger.warning_once(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression "
"will be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
prepare_moe_fp4_layer_for_marlin(layer)
self.moe_quant_config = self.get_fused_moe_quant_config(layer)

View File

@@ -147,13 +147,6 @@ def apply_fp4_marlin_linear(
def prepare_fp4_layer_for_marlin(
layer: torch.nn.Module, input_dtype: torch.dtype | None = None
) -> None:
logger.warning_once(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression will "
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
is_nvfp4 = hasattr(layer, "weight_global_scale")
if input_dtype is not None and input_dtype.itemsize == 1:
if is_nvfp4:
@@ -335,13 +328,6 @@ def prepare_nvfp4_moe_layer_for_marlin(
def prepare_moe_fp4_layer_for_marlin(
layer: torch.nn.Module, input_dtype: torch.dtype | None = None
) -> None:
logger.warning_once(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression will "
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
if input_dtype is not None and input_dtype.itemsize == 1:
if is_nvfp4:

View File

@@ -141,6 +141,12 @@ def convert_to_nvfp4_linear_kernel_format(
layer.weights_padding_cols = 0
if backend == NvFp4LinearBackend.MARLIN:
logger.warning_once(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression "
"will be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
prepare_fp4_layer_for_marlin(layer)
elif backend == NvFp4LinearBackend.FLASHINFER_TRTLLM:
weight, weight_scale = prepare_weights_for_nvfp4_flashinfer_trtllm(