Add llmcompressor fp8 kv-cache quant (per-tensor and per-attn_head) (#30141)

Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
Signed-off-by: eldarkurtic <8884008+eldarkurtic@users.noreply.github.com>
Author: Eldar Kurtić
Date:   2026-01-22 21:29:57 +01:00
Committed by: GitHub
Parent: 955b43a5a5
Commit: 44f08af3a7
18 changed files with 558 additions and 263 deletions
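For context, a compressed-tensors checkpoint declares KV-cache quantization through a `kv_cache_scheme` entry in its `config.json`. Below is a minimal sketch of the two variants exercised by the tests in this commit, written as Python dicts following the compressed-tensors `QuantizationArgs` field names; the exact `strategy` value for per-head scales (`"attn_head"`) is an assumption taken from the commit title and the test model names, not from this diff:

```python
# Hedged sketch of the compressed-tensors kv_cache_scheme; not copied
# from this commit.
per_tensor_scheme = {
    "num_bits": 8,          # fp8
    "type": "float",
    "strategy": "tensor",   # a single scale shared by the whole K/V cache
    "symmetric": True,
    "dynamic": False,       # scales calibrated offline by llmcompressor
}

# Assumed spelling of the new per-attention-head strategy.
per_attn_head_scheme = {**per_tensor_scheme, "strategy": "attn_head"}
```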

@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported,
 )
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
 
 # AITER only supports per-channel-per-channel INT8 gemm
 # and per-tensor-per-tensor INT8 GEMM.
@@ -360,9 +361,26 @@ def test_compressed_tensors_fp8(vllm_runner):
 @pytest.mark.skipif(
     not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
 )
-def test_compressed_tensors_kv_cache(vllm_runner):
-    model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-    with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
+def test_compressed_tensors_kv_cache_fp8_per_tensor(vllm_runner):
+    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-tensor"
+    with vllm_runner(model_path) as llm:
         output = llm.generate_greedy("Hello world!", max_tokens=4)
         assert output
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
+def test_compressed_tensors_kv_cache_fp8_per_attn_head(vllm_runner):
+    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-attn_head"
+    try:
+        fa_version = get_flash_attn_version()
+    except Exception:
+        pytest.skip("This test requires FlashAttention backend.")
+    if fa_version is None or fa_version < 3:
+        pytest.skip("This test requires FlashAttention version >= 3.")
+    with vllm_runner(model_path, attention_config={"backend": "FLASH_ATTN"}) as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=4)
+        assert output
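As a quick usage sketch outside pytest, the per-tensor checkpoint named in the first test can be loaded through the public `vllm.LLM` API; the sampling parameters below mirror the greedy, 4-token generation in the tests and are otherwise illustrative:

```python
from vllm import LLM, SamplingParams

# Load the fp8 per-tensor kv-cache checkpoint from the test above.
llm = LLM(model="nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-tensor")

# Greedy decoding for 4 tokens, matching llm.generate_greedy in the test.
params = SamplingParams(temperature=0.0, max_tokens=4)
outputs = llm.generate(["Hello world!"], params)
print(outputs[0].outputs[0].text)
```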