[Bugfix] Ensure calculated KV scales are applied in attention. (#27232)
Signed-off-by: adabeyta <aabeyta@redhat.com>
(cherry picked from commit a5a790eea6)
committed by Kevin H. Luu
parent 30700b1cd7
commit 75ecaf48fe
@@ -183,8 +183,14 @@ def test_custom_compile_config(
     "compilation_mode",
     [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
 )
-def test_fp8_kv_scale_compile(compilation_mode: int):
-    model = "Qwen/Qwen2-0.5B"
+@pytest.mark.parametrize(
+    "model",
+    [
+        "Qwen/Qwen2-0.5B",  # Standard attention model
+        "deepseek-ai/DeepSeek-V2-Lite",  # MLA (Multi-head Latent Attention) model
+    ],
+)
+def test_fp8_kv_scale_compile(compilation_mode: int, model: str):
     model_kwargs = {
         "quantization": "fp8",
         "kv_cache_dtype": "fp8_e4m3",
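For context, below is a minimal sketch of how a test like the one above might drive vLLM end to end. It assumes the LLM entrypoint accepts quantization, kv_cache_dtype, and a calculate_kv_scales flag (the runtime scale computation this bugfix concerns); the test name, prompt, assertion, and the trust_remote_code setting are illustrative, and the real test additionally sweeps compilation_mode as shown in the diff.

import pytest
from vllm import LLM, SamplingParams

@pytest.mark.parametrize(
    "model",
    [
        "Qwen/Qwen2-0.5B",  # standard attention
        "deepseek-ai/DeepSeek-V2-Lite",  # MLA attention
    ],
)
def test_fp8_kv_scale_smoke(model: str):
    # calculate_kv_scales asks the engine to compute KV-cache scaling
    # factors at runtime; the fix ensures those computed scales are
    # actually applied in the attention kernels.
    llm = LLM(
        model=model,
        quantization="fp8",
        kv_cache_dtype="fp8_e4m3",
        calculate_kv_scales=True,   # assumed engine flag for runtime KV scales
        trust_remote_code=True,     # DeepSeek-V2-Lite ships custom model code
    )
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))
    # A smoke check: generation should succeed and produce text with the
    # calculated scales in effect.
    assert outputs[0].outputs[0].text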