[Bugfix][CI] ALiBi test case in xformers multi_query_kv_attention (#11301)

This commit is contained in:
Nicolò Lucchesi
2025-03-06 05:00:53 +01:00
committed by GitHub
parent 3dbd2d813a
commit 5ee10e990d
3 changed files with 83 additions and 22 deletions

@@ -439,14 +439,16 @@ def test_contexted_kv_attention_alibi(
# heads.
#
# see also: vllm/model_executor/layers/attention.py
query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv,
                   query.shape[-1])
key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
                                num_queries_per_kv, key.shape[-1])
value = value[:, :, None, :].expand(value.shape[0], num_kv_heads,
                                    num_queries_per_kv, value.shape[-1])
# [seq, num_kv_heads, num_queries_per_kv, dk] =>
# [seq, num_kv_heads * num_queries_per_kv, dk], to comply with the rest
# of the codebase. This saves time reshaping the ALiBi matrix at runtime.
key = key.reshape(key.shape[0], -1, key.shape[-1])
value = value.reshape(value.shape[0], -1, value.shape[-1])
query = query.unsqueeze(0)
key = key.unsqueeze(0)
value = value.unsqueeze(0)
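
For orientation, the hunk above applies the usual GQA trick: broadcast each KV head across its group of query heads, then flatten back to a single head axis so key and value line up with the query's heads. Below is a minimal standalone sketch of that reshaping; the tensor sizes are hypothetical, and the query is kept with a flat head axis throughout rather than the grouped view used in the test:

import torch

# Illustrative sizes, not taken from the test.
seq_len, num_kv_heads, num_queries_per_kv, head_dim = 16, 4, 2, 64
num_heads = num_kv_heads * num_queries_per_kv

query = torch.randn(seq_len, num_heads, head_dim)
key = torch.randn(seq_len, num_kv_heads, head_dim)
value = torch.randn(seq_len, num_kv_heads, head_dim)

# Broadcast each KV head across its group of query heads.
# expand() returns a view, so no data is copied yet:
# [seq, num_kv_heads, dk] -> [seq, num_kv_heads, num_queries_per_kv, dk]
key = key[:, :, None, :].expand(seq_len, num_kv_heads,
                                num_queries_per_kv, head_dim)
value = value[:, :, None, :].expand(seq_len, num_kv_heads,
                                    num_queries_per_kv, head_dim)

# Flatten the grouped axes into one head axis so key/value match the
# query's num_heads = num_kv_heads * num_queries_per_kv heads; reshape
# materializes the expanded view:
# [seq, num_kv_heads, num_queries_per_kv, dk] -> [seq, num_heads, dk]
key = key.reshape(seq_len, -1, head_dim)
value = value.reshape(seq_len, -1, head_dim)

# Add the leading batch dimension the attention kernel expects.
query, key, value = (t.unsqueeze(0) for t in (query, key, value))
assert query.shape == key.shape == value.shape == (1, seq_len,
                                                   num_heads, head_dim)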