[Misc] Support FP8 kv cache scales from compressed-tensors (#6528)

2024-07-23 00:11:50 -04:00
parent e519ae097a
commit 9e0b558a09
7 changed files with 186 additions and 75 deletions
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -150,3 +150,10 @@ def test_compressed_tensors_fp8(vllm_runner):

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output
+
+
+def test_compressed_tensors_kv_cache(vllm_runner):
+    model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
+    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=20)
+        assert output