[Sparse24] [Deprecation] Remove Sparse24 CT integration and kernels (#36799)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2026-03-23 16:03:29 -04:00
parent fafe76b4af
commit 38364a7e32
17 changed files with 9 additions and 2674 deletions
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -12,7 +12,6 @@ from compressed_tensors.quantization import QuantizationType
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.fused_moe import UnquantizedFusedMoEMethod
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensors24,
     CompressedTensorsLinearMethod,
     CompressedTensorsW4A4Fp4,
     CompressedTensorsW4A8Fp8,
@@ -27,9 +26,6 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8
 from vllm.model_executor.layers.quantization.utils.nvfp4_utils import (
     cutlass_fp4_supported,
 )
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    sparse_cutlass_supported,
-)
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.fa_utils import get_flash_attn_version

@@ -362,283 +358,6 @@ def test_compressed_tensors_kv_cache_fp8_per_attn_head(vllm_runner):
         assert output


-@pytest.mark.skipif(
-    not sparse_cutlass_supported(),
-    reason="Sparse FP8 is not yet supported on this GPU type.",
-)
-def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="dense"):
-    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(qkv_proj.scheme, CompressedTensors24)
-
-    assert qkv_proj.scheme.weight_quant.strategy == weight_strategy
-    assert qkv_proj.scheme.input_quant.strategy == input_strategy
-    assert qkv_proj.scheme.quantized
-    assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-    sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
-    assert sparsity_map.get("Linear").format == format
-    assert sparsity_map.get("Linear").sparsity_structure == "2:4"
-
-
-@pytest.mark.skipif(
-    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
-    reason="Sparse FP8 is not yet supported on this GPU type.",
-)
-@pytest.mark.parametrize(
-    "args_2of4",
-    [
-        (
-            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
-            "channel",
-            "token",
-        ),
-        (
-            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-            "channel",
-            "tensor",
-        ),
-        (
-            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
-            "tensor",
-            "tensor",
-        ),
-        (
-            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-            "tensor",
-            "token",
-        ),
-    ],
-)
-def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
-    model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model, enforce_eager=True) as llm:
-
-        def check_model(model):
-            layer = model.model.layers[0]
-
-            qkv_proj = layer.self_attn.qkv_proj
-            assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
-            _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        print(output)
-        assert output
-
-
-@pytest.mark.skipif(
-    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
-    reason="Sparse FP8 is not yet supported on this GPU type.",
-)
-@pytest.mark.parametrize(
-    "args_2of4",
-    [
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
-            "channel",
-            "token",
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
-            "channel",
-            "tensor",
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
-            "tensor",
-            "token",
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
-            "tensor",
-            "tensor",
-        ),
-    ],
-)
-def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
-    model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model, enforce_eager=True) as llm:
-
-        def check_model(model):
-            layer = model.model.layers[0]
-
-            qkv_proj = layer.self_attn.qkv_proj
-            assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
-            _test_2of4_quant_models(
-                qkv_proj,
-                weight_strategy,
-                input_strategy,
-                format="sparse-24-bitmask",
-            )
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        print(output)
-        assert output
-
-
-@pytest.mark.skipif(
-    not sparse_cutlass_supported(),
-    reason="cutlass is not yet supported on this GPU type.",
-)
-@pytest.mark.parametrize(
-    "args_2of4",
-    [
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
-            "channel",
-            "token",
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
-            "channel",
-            "tensor",
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
-            "tensor",
-            "token",
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
-            "tensor",
-            "tensor",
-        ),
-    ],
-)
-def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
-    model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model, enforce_eager=True) as llm:
-
-        def check_model(model):
-            layer = model.model.layers[0]
-
-            qkv_proj = layer.self_attn.qkv_proj
-            assert qkv_proj.scheme.weights_dtype == torch.int8
-            _test_2of4_quant_models(
-                qkv_proj,
-                weight_strategy,
-                input_strategy,
-                format="sparse-24-bitmask",
-            )
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        print(output)
-        assert output
-
-
-@pytest.mark.skipif(
-    not sparse_cutlass_supported(),
-    reason="Sparse FP8 is not yet supported on this GPU type.",
-)
-@pytest.mark.parametrize(
-    "args_2of4",
-    [
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-            "channel",
-            "token",
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
-            "tensor",
-            "tensor",
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-            "tensor",
-            "token",
-        ),
-    ],
-)
-def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
-    model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model, enforce_eager=True) as llm:
-
-        def check_model(model):
-            layer = model.model.layers[0]
-
-            qkv_proj = layer.self_attn.qkv_proj
-            assert qkv_proj.scheme.weights_dtype == torch.int8
-            _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        print(output)
-        assert output
-
-
-@pytest.mark.skipif(
-    not sparse_cutlass_supported(),
-    reason="2of4 Sparse is not yet supported on this GPU type.",
-)
-@pytest.mark.parametrize(
-    "args_2of4",
-    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")],
-)
-def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
-    model = args_2of4
-    with vllm_runner(model, enforce_eager=True) as llm:
-
-        def check_model(model):
-            layer = model.model.layers[0]
-
-            qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-            assert isinstance(qkv_proj.scheme, CompressedTensors24)
-
-            assert qkv_proj.scheme.weight_quant is None
-            assert qkv_proj.scheme.input_quant is None
-            assert not qkv_proj.scheme.quantized
-            assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
-            assert sparsity_map.get("Linear").format == "dense"
-            assert sparsity_map.get("Linear").sparsity_structure == "2:4"
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        print(output)
-        assert output
-
-
-@pytest.mark.skipif(
-    not sparse_cutlass_supported(),
-    reason="Cutlass is not yet supported on this GPU type.",
-)
-@pytest.mark.parametrize(
-    "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")]
-)
-def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
-    model = args_2of4
-    with vllm_runner(model, enforce_eager=True) as llm:
-
-        def check_model(model):
-            layer = model.model.layers[0]
-
-            qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-            assert isinstance(qkv_proj.scheme, CompressedTensors24)
-
-            assert qkv_proj.scheme.weight_quant is None
-            assert qkv_proj.scheme.input_quant is None
-            assert not qkv_proj.scheme.quantized
-            assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
-            assert sparsity_map.get("Linear").format == "sparse-24-bitmask"
-            assert sparsity_map.get("Linear").sparsity_structure == "2:4"
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        print(output)
-        assert output
-
-
 @pytest.mark.skipif(
     not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
 )