[Kernel] Raise verbose error and consolidate num_heads/num_kv_heads divisibility check (#19339)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
This commit is contained in:
22quinn
2025-06-14 22:43:48 -07:00
committed by GitHub
parent ee1531bc38
commit 0b73736a0d
17 changed files with 24 additions and 19 deletions

View File

@@ -10,6 +10,7 @@ import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops
from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes
@@ -506,3 +507,18 @@ def test_multi_query_kv_attention_with_alibi(
device,
use_alibi=True,
)
@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
def test_num_heads_not_divisible_by_num_kv_heads(attention_cls: type) -> None:
    """Constructing an attention layer must fail fast (AssertionError)
    when num_heads is not evenly divisible by num_kv_heads.

    Runs against both Attention and MultiHeadAttention to confirm the
    divisibility check is consolidated across implementations.
    """
    head_size = 64
    # Standard scaled-dot-product attention scale; 1.0 / sqrt(head_size).
    scale = 1.0 / (head_size**0.5)
    num_heads = 16
    num_kv_heads = 5  # 16 % 5 != 0 — deliberately invalid configuration.
    with pytest.raises(AssertionError):
        _ = attention_cls(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            num_kv_heads=num_kv_heads,
        )