[Bugfix] Fix Sparse24 Compressed Tensors models (#33446)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-12 02:15:16 -05:00
parent 80f2ba6ea6
commit e9cd691132
3 changed files with 17 additions and 15 deletions
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -207,18 +207,19 @@ class CompressedTensorsConfig(QuantizationConfig):
        # because Attention quantization on its own is not supported by vLLM.
        # It is coupled with KV-cache quantization, and if scales are present in the
        # checkpoint, they will be used properly.
-        grps_without_attn_quant = {}
-        for k, v in config["config_groups"].items():
-            # e.g. LlamaAttention, Qwen3Attention, etc.
-            if len(v["targets"]) == 1 and v["targets"][0].endswith("Attention"):
-                logger.warning(
-                    "Skipping CompressedTensors config group for %s. Attention quant "
-                    "is coupled with KV-cache quantization in vLLM.",
-                    v["targets"][0],
-                )
-                continue
-            grps_without_attn_quant[k] = v
-        config["config_groups"] = grps_without_attn_quant
+        if "config_groups" in config:
+            grps_without_attn_quant = {}
+            for k, v in config["config_groups"].items():
+                # e.g. LlamaAttention, Qwen3Attention, etc.
+                if len(v["targets"]) == 1 and v["targets"][0].endswith("Attention"):
+                    logger.warning(
+                        "Skipping CompressedTensors config group for %s. Attention "
+                        "quant is coupled with KV-cache quantization in vLLM.",
+                        v["targets"][0],
+                    )
+                    continue
+                grps_without_attn_quant[k] = v
+            config["config_groups"] = grps_without_attn_quant

        ignore: list[str] = cast(list[str], config.get("ignore", []))
        quant_format = cast(str, config.get("format"))
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -261,6 +261,7 @@ def get_quant_config(
    if (
        hf_quant_config is not None
        and hf_quant_config.get("quant_method") == "compressed-tensors"
+        and "config_groups" in hf_quant_config
    ):
        if hf_text_config is not None:
            n_heads = getattr(hf_text_config, "num_attention_heads", None)