[Bugfix] Disable w16a16 2of4 sparse CompressedTensors24 (#12417)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Authored by Tyler Michael Smith on 2025-01-26 06:59:58 -05:00, committed by GitHub
parent 9ddc35220b
commit aa2cd2c43d
6 changed files with 263 additions and 169 deletions

@@ -9,6 +9,7 @@ from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationType)
 from pydantic import BaseModel
 
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
@@ -27,6 +28,8 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
 
+logger = init_logger(__name__)
+
 __all__ = ["CompressedTensorsLinearMethod"]
 
 SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
@@ -79,6 +82,8 @@ class CompressedTensorsConfig(QuantizationConfig):
             return UnquantizedLinearMethod()
         if isinstance(layer, LinearBase):
             scheme = self.get_scheme(layer=layer, layer_name=prefix)
+            if scheme is None:
+                return UnquantizedLinearMethod()
             layer.scheme = scheme
             return CompressedTensorsLinearMethod(self)
         if isinstance(layer, Attention):
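The early return added above changes the contract of get_quant_method: a None scheme now means "run this layer unquantized" rather than being an error. A minimal, self-contained sketch of that fallback pattern, using stand-in classes rather than vLLM's real ones:

    from typing import Optional

    class UnquantizedLinearMethod:
        """Stand-in for vLLM's UnquantizedLinearMethod (plain fp16/bf16 matmul)."""

    class CompressedTensorsLinearMethod:
        """Stand-in for the quantized path; holds the scheme chosen for the layer."""
        def __init__(self, scheme: object):
            self.scheme = scheme

    def get_quant_method(scheme: Optional[object]):
        # After this commit, get_scheme may return None; that selects the
        # unquantized path instead of raising an error.
        if scheme is None:
            return UnquantizedLinearMethod()
        return CompressedTensorsLinearMethod(scheme)

    assert isinstance(get_quant_method(None), UnquantizedLinearMethod)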
@@ -340,10 +345,10 @@ class CompressedTensorsConfig(QuantizationConfig):
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
 
-    def get_scheme(
-            self,
-            layer: torch.nn.Module,
-            layer_name: Optional[str] = None) -> "CompressedTensorsScheme":
+    def get_scheme(self,
+                   layer: torch.nn.Module,
+                   layer_name: Optional[str] = None
+                   ) -> Optional["CompressedTensorsScheme"]:
         """
         compressed-tensors supports non uniform in the following way:
 
@@ -353,10 +358,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         which can be a full layer_name, a regex for a layer_name, or
         an nn.Module name.
 
-        We first check whether a layer is in the ignore group and use
-        CompressedTensorsUnquantized (i.e. fp16/bf16) scheme for the layer
-
-        We then detect whether a layer_name is found in any target and
+        Detect whether a layer_name is found in any target and
         use the quantization scheme corresponding to the matched target
         to select the CompressedTensorsScheme used for infernece.
         """
@@ -394,6 +396,13 @@ class CompressedTensorsConfig(QuantizationConfig):
         if self.supports_cutlass_24(weight_quant=weight_quant,
                                     input_quant=input_quant,
                                     sparsity_scheme=sparsity_scheme):
+            # FIXME(tlrmchlsmth): layers using W16A16 CUTLASS 2:4 sparse kernels
+            # currently produce bad output in some cases
+            if weight_quant is None:
+                logger.warning_once(
+                    "CompressedTensors24 scheme is disabled for the w16a16 "
+                    "case. Falling back to UnquantizedLinearMethod")
+                return None
             # Have a valid sparsity scheme
             # Validate layer is supported by Cutlass 2:4 Kernel
             scheme = CompressedTensors24(quantized=weight_quant is not None
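The added branch amounts to this decision: 2:4 sparsity with quantized weights (e.g. w8a8) still takes the CUTLASS path, while sparse-only w16a16 layers are rejected and fall back to unquantized execution. A standalone sketch of that logic, using hypothetical stand-ins rather than vLLM's actual config objects:

    from typing import Optional

    def select_24_scheme(weight_quant: Optional[dict],
                         supports_cutlass_24: bool) -> Optional[str]:
        if not supports_cutlass_24:
            return None
        if weight_quant is None:
            # w16a16 CUTLASS 2:4 sparse kernels can produce bad output in
            # some cases, so this path is disabled by the commit above.
            return None
        return "CompressedTensors24"

    assert select_24_scheme(None, True) is None                    # w16a16: disabled
    assert select_24_scheme({"num_bits": 8}, True) == "CompressedTensors24"  # w8a8: ok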