[Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081)

Author: Michael Goin
Date: 2024-07-16 18:31:32 -04:00
Committed by: GitHub
Parent: 160e1d8c99
Commit: 978aed5300
33 changed files with 317 additions and 185 deletions

@@ -155,11 +155,11 @@ def test_reshape_and_cache(
     cloned_value_cache = value_cache.clone()
     # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0
     # Call the reshape_and_cache kernel.
     ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
-                          kv_cache_dtype, kv_scale)
+                          kv_cache_dtype, k_scale, v_scale)
     if kv_cache_dtype == "fp8":
         result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
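The change is mechanical for callers: anywhere a single kv_scale float was passed to the cache kernels, two floats (k_scale, v_scale) now go in its place, which lets FP8 KV caches quantize keys and values with independent scales, since the two tensors can have different dynamic ranges. Below is a minimal sketch of a migrated call, assuming vLLM's _custom_ops wrapper, a CUDA device, and illustrative tensor shapes; the key-cache packing factor x = 8 follows the fp16 convention, and none of these concrete values come from the diff itself.

    import torch
    from vllm import _custom_ops as ops

    # Illustrative sizes only; not the values used in the actual test.
    num_tokens, num_heads, head_size = 16, 8, 128
    num_blocks, block_size, x = 4, 16, 8  # x: elements packed per key-cache vector

    key = torch.randn(num_tokens, num_heads, head_size,
                      dtype=torch.float16, device="cuda")
    value = torch.randn_like(key)
    key_cache = torch.zeros(num_blocks, num_heads, head_size // x, block_size, x,
                            dtype=torch.float16, device="cuda")
    value_cache = torch.zeros(num_blocks, num_heads, head_size, block_size,
                              dtype=torch.float16, device="cuda")
    slot_mapping = torch.arange(num_tokens, dtype=torch.long, device="cuda")

    # Before this commit: a single shared scale.
    #   ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
    #                         "auto", kv_scale)
    # After this commit: independent scales for the key and value caches.
    k_scale = v_scale = 1.0
    ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
                          "auto", k_scale, v_scale)

With kv_cache_dtype "auto" (no FP8 quantization) the scales are unused, so 1.0 is the safe default during migration, as the updated test above does with k_scale = v_scale = 1.0.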