[CPU] Support FP8 KV cache (#14741)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-03-15 13:07:36 +08:00
parent 877e352262
commit a2ae496589
8 changed files with 122 additions and 36 deletions
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -11,6 +11,7 @@ import pytest

 from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
+from vllm.platforms import current_platform

 from ...utils import check_logprobs_close

@@ -93,3 +94,63 @@ def test_models(
        name_0="fp16_kv_cache",
        name_1="fp8_kv_cache",
    )
+
+
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(),
+                    reason="test for the CPU backend.")
+@pytest.mark.parametrize(
+    "kv_cache_dtype,base_model,test_model",
+    [
+        # Test a BF16 checkpoint with an fp8_e5m2 KV cache.
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct"),
+    ])
+# Due to low-precision numerical divergence, only test logprobs of 4 tokens
+@pytest.mark.parametrize("max_tokens", [4])
+# Due to low-precision numerical divergence, this test is too sensitive for
+# the async postprocessor, so async output processing is disabled
+@pytest.mark.parametrize("disable_async_output_proc", [True])
+def test_cpu_models(
+    vllm_runner,
+    example_prompts,
+    kv_cache_dtype: str,
+    base_model: str,
+    test_model: str,
+    max_tokens: int,
+    disable_async_output_proc: bool,
+) -> None:
+    """
+    Only checks that the logprobs of the "auto" and `kv_cache_dtype` KV-cache
+    runs are close, to tolerate discrepancies in numerically sensitive kernels.
+    """
+
+    MAX_MODEL_LEN = 1024
+    NUM_LOG_PROBS = 8
+
+    with vllm_runner(
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            dtype="bfloat16",
+            kv_cache_dtype="auto",
+            disable_async_output_proc=disable_async_output_proc,
+    ) as vllm_model:
+        baseline_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)
+
+    with vllm_runner(
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            dtype="bfloat16",
+            kv_cache_dtype=kv_cache_dtype,
+            disable_async_output_proc=disable_async_output_proc,
+    ) as vllm_model:
+        test_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)
+
+    check_logprobs_close(
+        outputs_0_lst=baseline_outputs,
+        outputs_1_lst=test_outputs,
+        name_0="bf16_kv_cache",
+        name_1="fp8_kv_cache",
+    )