[Bugfix] Disable w16a16 2of4 sparse CompressedTensors24 (#12417)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-01-26 06:59:58 -05:00
parent 9ddc35220b
commit aa2cd2c43d
6 changed files with 263 additions and 169 deletions
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -313,8 +313,10 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
        assert output


+@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
+                    reason="2of4 Sparse is not yet supported on this GPU type."
+                    )
@pytest.mark.parametrize(
    "args_2of4",
    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])