[Misc] Support FP8 kv cache scales from compressed-tensors (#6528)

This commit is contained in:
Michael Goin
2024-07-23 00:11:50 -04:00
committed by GitHub
parent e519ae097a
commit 9e0b558a09
7 changed files with 186 additions and 75 deletions

View File

@@ -150,3 +150,10 @@ def test_compressed_tensors_fp8(vllm_runner):
output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output
def test_compressed_tensors_kv_cache(vllm_runner):
    """Smoke-test greedy generation with an FP8 kv cache.

    Loads the checkpoint below through the ``vllm_runner`` fixture with
    ``kv_cache_dtype="fp8"``, greedily generates up to 20 tokens for a
    short prompt, and checks that the result is truthy.

    NOTE(review): the checkpoint name suggests it carries a
    compressed-tensors kv-cache scheme; confirm against the model repo.
    """
    checkpoint = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
    with vllm_runner(checkpoint, kv_cache_dtype="fp8") as runner:
        generated = runner.generate_greedy("Hello world!", max_tokens=20)
        # Only checks that something came back; no text is compared.
        assert generated