[UX] Only show FP4 Marlin fallback warning for w4a4 models (#36806)
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -324,6 +324,12 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
             )
             delattr(layer, "w2_weight_packed")

+        logger.warning_once(
+            "Your GPU does not have native support for FP4 computation but "
+            "FP4 quantization is being used. Weight-only FP4 compression "
+            "will be used leveraging the Marlin kernel. This may degrade "
+            "performance for compute-heavy workloads."
+        )
         prepare_moe_fp4_layer_for_marlin(layer)

         self.moe_quant_config = self.get_fused_moe_quant_config(layer)

@@ -147,13 +147,6 @@ def apply_fp4_marlin_linear(
 def prepare_fp4_layer_for_marlin(
     layer: torch.nn.Module, input_dtype: torch.dtype | None = None
 ) -> None:
-    logger.warning_once(
-        "Your GPU does not have native support for FP4 computation but "
-        "FP4 quantization is being used. Weight-only FP4 compression will "
-        "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads."
-    )
-
     is_nvfp4 = hasattr(layer, "weight_global_scale")
     if input_dtype is not None and input_dtype.itemsize == 1:
         if is_nvfp4:
@@ -335,13 +328,6 @@ def prepare_nvfp4_moe_layer_for_marlin(
 def prepare_moe_fp4_layer_for_marlin(
     layer: torch.nn.Module, input_dtype: torch.dtype | None = None
 ) -> None:
-    logger.warning_once(
-        "Your GPU does not have native support for FP4 computation but "
-        "FP4 quantization is being used. Weight-only FP4 compression will "
-        "be used leveraging the Marlin kernel. This may degrade "
-        "performance for compute-heavy workloads."
-    )
-
     is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
     if input_dtype is not None and input_dtype.itemsize == 1:
         if is_nvfp4:

@@ -141,6 +141,12 @@ def convert_to_nvfp4_linear_kernel_format(
         layer.weights_padding_cols = 0

     if backend == NvFp4LinearBackend.MARLIN:
+        logger.warning_once(
+            "Your GPU does not have native support for FP4 computation but "
+            "FP4 quantization is being used. Weight-only FP4 compression "
+            "will be used leveraging the Marlin kernel. This may degrade "
+            "performance for compute-heavy workloads."
+        )
         prepare_fp4_layer_for_marlin(layer)
     elif backend == NvFp4LinearBackend.FLASHINFER_TRTLLM:
         weight, weight_scale = prepare_weights_for_nvfp4_flashinfer_trtllm(

Reference in New Issue
Block a user