[Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081)

Author: Michael Goin
Committed: 2024-07-16 18:31:32 -04:00 (via GitHub)
Parent: 160e1d8c99
Commit: 978aed5300
33 changed files with 317 additions and 185 deletions
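
This change splits the fused kv_scale used for FP8 KV-cache quantization into independent k_scale and v_scale arguments. A minimal sketch of why per-tensor scales matter (the constant and helper below are illustrative assumptions, not code from this commit):

import torch

FP8_E4M3_MAX = 448.0  # finite max of torch.float8_e4m3fn

def per_tensor_scale(x: torch.Tensor) -> float:
    # Pick the scale that maps the largest |value| onto the FP8 range.
    return x.abs().max().item() / FP8_E4M3_MAX

key = torch.randn(16, 8, 64) * 5.0    # synthetic K with a wide range
value = torch.randn(16, 8, 64) * 0.5  # synthetic V with a narrow range

k_scale = per_tensor_scale(key)    # large, sized for K
v_scale = per_tensor_scale(value)  # small, sized for V
# A single fused kv_scale must be large enough for both tensors,
# wasting FP8 resolution on whichever has the smaller range.
kv_scale = max(k_scale, v_scale)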


@@ -212,7 +212,7 @@ def test_paged_attention(
     key_cache, value_cache = key_caches[0], value_caches[0]
     # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0
     tp_rank = 0
     # Call the paged attention kernel.
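
The test now seeds both scales with the same neutral default of 1.0 (no scaling). For models serialized with a single fused scale, a backwards-compatible loader could map the old parameter onto both new attributes; a hypothetical sketch (helper name and dict layout assumed, not necessarily what this commit does):

def load_kv_scales(layer, weights: dict) -> None:
    # Legacy checkpoints carry one fused scale; newer ones may carry two.
    if "kv_scale" in weights:
        layer.k_scale = float(weights["kv_scale"])
        layer.v_scale = float(weights["kv_scale"])
    else:
        layer.k_scale = float(weights.get("k_scale", 1.0))
        layer.v_scale = float(weights.get("v_scale", 1.0))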
@@ -231,7 +231,8 @@ def test_paged_attention(
         max_seq_len,
         alibi_slopes,
         kv_cache_dtype,
-        kv_scale,
+        k_scale,
+        v_scale,
         tp_rank=tp_rank,
         blocksparse_local_blocks=blocksparse_local_blocks,
         blocksparse_vert_stride=blocksparse_vert_stride,
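
At the call site, the single positional kv_scale becomes two positional arguments ahead of the keyword arguments. Conceptually the kernel applies each scale to its own cache tensor during dequantization; a Python sketch of that step (the real work happens in CUDA, and these names are assumptions):

import torch

def dequant_kv(key_q: torch.Tensor, value_q: torch.Tensor,
               k_scale: float, v_scale: float):
    # Each cache tensor gets its own dequantization scale, which is
    # exactly what passing two arguments instead of one enables.
    key = key_q.to(torch.float32) * k_scale
    value = value_q.to(torch.float32) * v_scale
    return key, value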
@@ -267,7 +268,8 @@ def test_paged_attention(
         max_seq_len,
         alibi_slopes,
         kv_cache_dtype,
-        kv_scale,
+        k_scale,
+        v_scale,
         tp_rank=tp_rank,
         blocksparse_local_blocks=blocksparse_local_blocks,
         blocksparse_vert_stride=blocksparse_vert_stride,
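
The second kernel call site receives the identical two-argument update. To see the precision effect the split is after, here is a self-contained round trip comparing a dedicated scale against a fused one (assumes a PyTorch build with float8_e4m3fn; the tensors are synthetic):

import torch

FP8_E4M3_MAX = 448.0  # finite max of torch.float8_e4m3fn

def quant_dequant(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Round-trip through FP8: quantize with the given scale, then restore.
    return (x / scale).to(torch.float8_e4m3fn).to(torch.float32) * scale

key = torch.randn(4096) * 8.0     # synthetic: wide dynamic range
value = torch.randn(4096) * 0.25  # synthetic: narrow dynamic range

k_scale = key.abs().max().item() / FP8_E4M3_MAX
v_scale = value.abs().max().item() / FP8_E4M3_MAX
fused_scale = max(k_scale, v_scale)  # what a single kv_scale must use

err_separate = (quant_dequant(value, v_scale) - value).abs().mean().item()
err_fused = (quant_dequant(value, fused_scale) - value).abs().mean().item()
print(f"mean abs error on V - separate: {err_separate:.6f}, fused: {err_fused:.6f}")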