Add llmcompressor fp8 kv-cache quant (per-tensor and per-attn_head) (#30141)

Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
Signed-off-by: eldarkurtic <8884008+eldarkurtic@users.noreply.github.com>
Author: Eldar Kurtić
Date:   2026-01-22 21:29:57 +01:00
Committed by: GitHub
Parent: 955b43a5a5
Commit: 44f08af3a7
18 changed files with 558 additions and 263 deletions
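For context, a compressed-tensors checkpoint declares KV-cache quantization through a `kv_cache_scheme` entry in its `config.json`. Below is a minimal sketch of the two variants exercised by the tests in this commit, written as Python dicts following the compressed-tensors `QuantizationArgs` field names; the exact `strategy` value for per-head scales (`"attn_head"`) is an assumption taken from the commit title and the test model names, not from this diff:

```python
# Hedged sketch of the compressed-tensors kv_cache_scheme; not copied
# from this commit.
per_tensor_scheme = {
    "num_bits": 8,          # fp8
    "type": "float",
    "strategy": "tensor",   # a single scale shared by the whole K/V cache
    "symmetric": True,
    "dynamic": False,       # scales calibrated offline by llmcompressor
}

# Assumed spelling of the new per-attention-head strategy.
per_attn_head_scheme = {**per_tensor_scheme, "strategy": "attn_head"}
```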

@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported,
 )
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
 
 # AITER only supports per-channel-per-channel INT8 gemm
 # and per-tensor-per-tensor INT8 GEMM.
@@ -360,9 +361,26 @@ def test_compressed_tensors_fp8(vllm_runner):
 @pytest.mark.skipif(
     not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
 )
-def test_compressed_tensors_kv_cache(vllm_runner):
-    model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-    with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
+def test_compressed_tensors_kv_cache_fp8_per_tensor(vllm_runner):
+    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-tensor"
+    with vllm_runner(model_path) as llm:
         output = llm.generate_greedy("Hello world!", max_tokens=4)
         assert output
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
+def test_compressed_tensors_kv_cache_fp8_per_attn_head(vllm_runner):
+    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-attn_head"
+    try:
+        fa_version = get_flash_attn_version()
+    except Exception:
+        pytest.skip("This test requires FlashAttention backend.")
+    if fa_version is None or fa_version < 3:
+        pytest.skip("This test requires FlashAttention version >= 3.")
+    with vllm_runner(model_path, attention_config={"backend": "FLASH_ATTN"}) as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=4)
+        assert output
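As a quick usage sketch outside pytest, the per-tensor checkpoint named in the first test can be loaded through the public `vllm.LLM` API; the sampling parameters below mirror the greedy, 4-token generation in the tests and are otherwise illustrative:

```python
from vllm import LLM, SamplingParams

# Load the fp8 per-tensor kv-cache checkpoint from the test above.
llm = LLM(model="nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-tensor")

# Greedy decoding for 4 tokens, matching llm.generate_greedy in the test.
params = SamplingParams(temperature=0.0, max_tokens=4)
outputs = llm.generate(["Hello world!"], params)
print(outputs[0].outputs[0].text)
```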