[Bugfix][Kernel] allow non-power-of-two head sizes in prefix prefill (#4128)

Author: Michał Moskal
Date: 2024-04-18 00:51:28 -07:00
Committed by: GitHub
Parent: 53b018edcb
Commit: e8cc7967ff
2 changed files with 28 additions and 18 deletions


@@ -10,7 +10,7 @@ from vllm.attention.ops.prefix_prefill import context_attention_fwd
 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 8, 64]
-HEAD_SIZES = [128]
+HEAD_SIZES = [128, 96]
 DTYPES = [torch.float16]
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]