[Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081)

Author: Michael Goin
Committed: 2024-07-16 18:31:32 -04:00 (via GitHub)
Parent: 160e1d8c99
Commit: 978aed5300
33 changed files with 317 additions and 185 deletions
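
This change splits the fused kv_scale used for FP8 KV-cache quantization into independent k_scale and v_scale arguments. A minimal sketch of why per-tensor scales matter (the constant and helper below are illustrative assumptions, not code from this commit):

import torch

FP8_E4M3_MAX = 448.0  # finite max of torch.float8_e4m3fn

def per_tensor_scale(x: torch.Tensor) -> float:
    # Pick the scale that maps the largest |value| onto the FP8 range.
    return x.abs().max().item() / FP8_E4M3_MAX

key = torch.randn(16, 8, 64) * 5.0    # synthetic K with a wide range
value = torch.randn(16, 8, 64) * 0.5  # synthetic V with a narrow range

k_scale = per_tensor_scale(key)    # large, sized for K
v_scale = per_tensor_scale(value)  # small, sized for V
# A single fused kv_scale must be large enough for both tensors,
# wasting FP8 resolution on whichever has the smaller range.
kv_scale = max(k_scale, v_scale)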


@@ -212,7 +212,7 @@ def test_paged_attention(
     key_cache, value_cache = key_caches[0], value_caches[0]
     # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0
     tp_rank = 0
     # Call the paged attention kernel.
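
The test now seeds both scales with the same neutral default of 1.0 (no scaling). For models serialized with a single fused scale, a backwards-compatible loader could map the old parameter onto both new attributes; a hypothetical sketch (helper name and dict layout assumed, not necessarily what this commit does):

def load_kv_scales(layer, weights: dict) -> None:
    # Legacy checkpoints carry one fused scale; newer ones may carry two.
    if "kv_scale" in weights:
        layer.k_scale = float(weights["kv_scale"])
        layer.v_scale = float(weights["kv_scale"])
    else:
        layer.k_scale = float(weights.get("k_scale", 1.0))
        layer.v_scale = float(weights.get("v_scale", 1.0))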
@@ -231,7 +231,8 @@ def test_paged_attention(
         max_seq_len,
         alibi_slopes,
         kv_cache_dtype,
-        kv_scale,
+        k_scale,
+        v_scale,
         tp_rank=tp_rank,
         blocksparse_local_blocks=blocksparse_local_blocks,
         blocksparse_vert_stride=blocksparse_vert_stride,
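
At the call site, the single positional kv_scale becomes two positional arguments ahead of the keyword arguments. Conceptually the kernel applies each scale to its own cache tensor during dequantization; a Python sketch of that step (the real work happens in CUDA, and these names are assumptions):

import torch

def dequant_kv(key_q: torch.Tensor, value_q: torch.Tensor,
               k_scale: float, v_scale: float):
    # Each cache tensor gets its own dequantization scale, which is
    # exactly what passing two arguments instead of one enables.
    key = key_q.to(torch.float32) * k_scale
    value = value_q.to(torch.float32) * v_scale
    return key, value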
@@ -267,7 +268,8 @@ def test_paged_attention(
         max_seq_len,
         alibi_slopes,
         kv_cache_dtype,
-        kv_scale,
+        k_scale,
+        v_scale,
         tp_rank=tp_rank,
         blocksparse_local_blocks=blocksparse_local_blocks,
         blocksparse_vert_stride=blocksparse_vert_stride,
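
The second kernel call site receives the identical two-argument update. To see the precision effect the split is after, here is a self-contained round trip comparing a dedicated scale against a fused one (assumes a PyTorch build with float8_e4m3fn; the tensors are synthetic):

import torch

FP8_E4M3_MAX = 448.0  # finite max of torch.float8_e4m3fn

def quant_dequant(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Round-trip through FP8: quantize with the given scale, then restore.
    return (x / scale).to(torch.float8_e4m3fn).to(torch.float32) * scale

key = torch.randn(4096) * 8.0     # synthetic: wide dynamic range
value = torch.randn(4096) * 0.25  # synthetic: narrow dynamic range

k_scale = key.abs().max().item() / FP8_E4M3_MAX
v_scale = value.abs().max().item() / FP8_E4M3_MAX
fused_scale = max(k_scale, v_scale)  # what a single kv_scale must use

err_separate = (quant_dequant(value, v_scale) - value).abs().mean().item()
err_fused = (quant_dequant(value, fused_scale) - value).abs().mean().item()
print(f"mean abs error on V - separate: {err_separate:.6f}, fused: {err_fused:.6f}")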