Add llmcompressor fp8 kv-cache quant (per-tensor and per-attn_head) (#30141)
Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
Signed-off-by: eldarkurtic <8884008+eldarkurtic@users.noreply.github.com>
This commit is contained in:
@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported,
 )
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.fa_utils import get_flash_attn_version

 # AITER only supports per-channel-per-channel INT8 gemm
 # and per-tensor-per-tensor INT8 GEMM.
@@ -360,9 +361,26 @@ def test_compressed_tensors_fp8(vllm_runner):
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
def test_compressed_tensors_kv_cache_fp8_per_tensor(vllm_runner):
    """Smoke-test that a checkpoint with a per-tensor fp8 kv-cache scheme
    loads and produces non-empty greedy output."""
    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-tensor"
    with vllm_runner(model_path) as runner:
        # A few tokens are enough to prove the quantized cache path works.
        generated = runner.generate_greedy("Hello world!", max_tokens=4)
        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
def test_compressed_tensors_kv_cache_fp8_per_attn_head(vllm_runner):
    """Smoke-test a per-attention-head fp8 kv-cache checkpoint; requires the
    FlashAttention backend at version 3 or newer, otherwise the test skips."""
    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-attn_head"

    # Probing the FA version can itself fail (backend unavailable) — treat
    # that as a distinct skip condition from an insufficient version.
    try:
        version = get_flash_attn_version()
    except Exception:
        pytest.skip("This test requires FlashAttention backend.")
    if version is None or version < 3:
        pytest.skip("This test requires FlashAttention version >= 3.")

    with vllm_runner(model_path, attention_config={"backend": "FLASH_ATTN"}) as runner:
        generated = runner.generate_greedy("Hello world!", max_tokens=4)
        assert generated
|
||||
|
||||
|
||||
Reference in New Issue
Block a user