[Bugfix] Ensure calculated KV scales are applied in attention. (#27232)

Signed-off-by: adabeyta <aabeyta@redhat.com>
(cherry picked from commit a5a790eea6)
Author: Adrian Abeyta
Date: 2025-11-10 17:42:37 -06:00
Committer: Kevin H. Luu
parent 30700b1cd7
commit 75ecaf48fe
4 changed files with 29 additions and 36 deletions

@@ -183,8 +183,14 @@ def test_custom_compile_config(
     "compilation_mode",
     [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
 )
-def test_fp8_kv_scale_compile(compilation_mode: int):
-    model = "Qwen/Qwen2-0.5B"
+@pytest.mark.parametrize(
+    "model",
+    [
+        "Qwen/Qwen2-0.5B",  # Standard attention model
+        "deepseek-ai/DeepSeek-V2-Lite",  # MLA (Multi-head Latent Attention) model
+    ],
+)
+def test_fp8_kv_scale_compile(compilation_mode: int, model: str):
     model_kwargs = {
         "quantization": "fp8",
         "kv_cache_dtype": "fp8_e4m3",