[Bugfix] Disable w16a16 2of4 sparse CompressedTensors24 (#12417)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith
2025-01-26 06:59:58 -05:00
committed by GitHub
parent 9ddc35220b
commit aa2cd2c43d
6 changed files with 263 additions and 169 deletions

View File

@@ -313,8 +313,10 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 assert output
+@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
 @pytest.mark.skipif(not sparse_cutlass_supported(),
-reason="Sparse FP8 is not yet supported on this GPU type.")
+reason="2of4 Sparse is not yet supported on this GPU type."
+)
 @pytest.mark.parametrize(
 "args_2of4",
 [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])