diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
index 38b929be4..dbed5fa4e 100644
--- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -6,11 +6,11 @@
 #include "cutlass_extensions/common.hpp"
 
 bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability) {
-  // sparse CUTLASS kernels need at least
+  // sparse CUTLASS kernels need exactly Hopper and are not forward compatible
   //   CUDA 12.2 and SM90 (Hopper)
 
 #if defined CUDA_VERSION
-  return CUDA_VERSION >= 12020 && cuda_device_capability >= 90;
+  return CUDA_VERSION >= 12020 && cuda_device_capability == 90;
 #endif
 
   return false;
@@ -98,7 +98,7 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a) {
 
   TORCH_CHECK_NOT_IMPLEMENTED(
       false,
-      "No compiled cutlass_sparse_compress for a compute capability less than "
+      "No compiled cutlass_sparse_compress for a compute capability equal to "
       "CUDA device capability: ",
       version_num);
 }
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index df3d733b7..9de2228b7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -207,18 +207,19 @@ class CompressedTensorsConfig(QuantizationConfig):
         # because Attention quantization on its own is not supported by vLLM.
         # It is coupled with KV-cache quantization, and if scales are present in the
         # checkpoint, they will be used properly.
-        grps_without_attn_quant = {}
-        for k, v in config["config_groups"].items():
-            # e.g. LlamaAttention, Qwen3Attention, etc.
-            if len(v["targets"]) == 1 and v["targets"][0].endswith("Attention"):
-                logger.warning(
-                    "Skipping CompressedTensors config group for %s. Attention quant "
-                    "is coupled with KV-cache quantization in vLLM.",
-                    v["targets"][0],
-                )
-                continue
-            grps_without_attn_quant[k] = v
-        config["config_groups"] = grps_without_attn_quant
+        if "config_groups" in config:
+            grps_without_attn_quant = {}
+            for k, v in config["config_groups"].items():
+                # e.g. LlamaAttention, Qwen3Attention, etc.
+                if len(v["targets"]) == 1 and v["targets"][0].endswith("Attention"):
+                    logger.warning(
+                        "Skipping CompressedTensors config group for %s. Attention "
+                        "quant is coupled with KV-cache quantization in vLLM.",
+                        v["targets"][0],
+                    )
+                    continue
+                grps_without_attn_quant[k] = v
+            config["config_groups"] = grps_without_attn_quant
 
         ignore: list[str] = cast(list[str], config.get("ignore", []))
         quant_format = cast(str, config.get("format"))
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 7025efd1c..43ea6f285 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -261,6 +261,7 @@ def get_quant_config(
     if (
         hf_quant_config is not None
        and hf_quant_config.get("quant_method") == "compressed-tensors"
+        and "config_groups" in hf_quant_config
    ):
        if hf_text_config is not None:
            n_heads = getattr(hf_text_config, "num_attention_heads", None)
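
For context on the `compressed_tensors.py` and `weight_utils.py` hunks: the guard exists because a compressed-tensors quantization config may omit `config_groups` entirely (e.g. a KV-cache-only checkpoint), and the old code indexed `config["config_groups"]` unconditionally. Below is a minimal, self-contained sketch of the guarded filtering behavior; the helper name `_drop_attention_only_groups` is hypothetical and only mirrors the logic in the diff, it is not part of vLLM's API.

```python
# Hypothetical standalone mirror of the guarded filtering done in
# CompressedTensorsConfig; for illustration only.
def _drop_attention_only_groups(config: dict) -> dict:
    # No-op when the checkpoint carries no config_groups at all
    # (the unguarded version would raise KeyError here).
    if "config_groups" in config:
        kept = {}
        for name, group in config["config_groups"].items():
            targets = group["targets"]
            # Skip groups that target only an attention module,
            # e.g. ["LlamaAttention"] or ["Qwen3Attention"]; attention quant
            # is handled together with KV-cache quantization instead.
            if len(targets) == 1 and targets[0].endswith("Attention"):
                continue
            kept[name] = group
        config["config_groups"] = kept
    return config


# A KV-cache-only style config without config_groups passes through unchanged.
cfg = {"quant_method": "compressed-tensors", "kv_cache_scheme": {"num_bits": 8}}
assert _drop_attention_only_groups(dict(cfg)) == cfg
```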