[Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081)

Author: Michael Goin
Date: 2024-07-16 18:31:32 -04:00
Committed by: GitHub
Parent: 160e1d8c99
Commit: 978aed5300
33 changed files with 317 additions and 185 deletions

@@ -155,11 +155,11 @@ def test_reshape_and_cache(
     cloned_value_cache = value_cache.clone()
     # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0
     # Call the reshape_and_cache kernel.
     ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
-                          kv_cache_dtype, kv_scale)
+                          kv_cache_dtype, k_scale, v_scale)
     if kv_cache_dtype == "fp8":
         result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
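The change is mechanical for callers: anywhere a single kv_scale float was passed to the cache kernels, two floats (k_scale, v_scale) now go in its place, which lets FP8 KV caches quantize keys and values with independent scales, since the two tensors can have different dynamic ranges. Below is a minimal sketch of a migrated call, assuming vLLM's _custom_ops wrapper, a CUDA device, and illustrative tensor shapes; the key-cache packing factor x = 8 follows the fp16 convention, and none of these concrete values come from the diff itself.

    import torch
    from vllm import _custom_ops as ops

    # Illustrative sizes only; not the values used in the actual test.
    num_tokens, num_heads, head_size = 16, 8, 128
    num_blocks, block_size, x = 4, 16, 8  # x: elements packed per key-cache vector

    key = torch.randn(num_tokens, num_heads, head_size,
                      dtype=torch.float16, device="cuda")
    value = torch.randn_like(key)
    key_cache = torch.zeros(num_blocks, num_heads, head_size // x, block_size, x,
                            dtype=torch.float16, device="cuda")
    value_cache = torch.zeros(num_blocks, num_heads, head_size, block_size,
                              dtype=torch.float16, device="cuda")
    slot_mapping = torch.arange(num_tokens, dtype=torch.long, device="cuda")

    # Before this commit: a single shared scale.
    #   ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
    #                         "auto", kv_scale)
    # After this commit: independent scales for the key and value caches.
    k_scale = v_scale = 1.0
    ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
                          "auto", k_scale, v_scale)

With kv_cache_dtype "auto" (no FP8 quantization) the scales are unused, so 1.0 is the safe default during migration, as the updated test above does with k_scale = v_scale = 1.0.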