[Kernel] GGUF MoeVec kernel (#16780)

Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
Szymon Ożóg
2025-05-07 14:07:23 +08:00
committed by GitHub
parent c3e9d5060e
commit 1a45a61387
8 changed files with 544 additions and 16 deletions

View File

@@ -145,7 +145,9 @@ def _fused_moe_gguf(
moe_align_block_size)
out_hidden_states = torch.empty_like(x)
if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES:
# unless we have decent expert reuse we are better off running moe_vec kernel
if (qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES
and x.shape[0] > 64):
num_tokens, _ = x.shape
E, N, _ = w1.shape
top_k = topk_ids.shape[1]
@@ -163,6 +165,20 @@ def _fused_moe_gguf(
out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
topk_weights.view(num_tokens, top_k, 1))
ops.moe_sum(out, out_hidden_states)
elif qweight_type2 in MMVQ_QUANT_TYPES and qweight_type in MMVQ_QUANT_TYPES:
num_tokens, _ = x.shape
E, N, _ = w1.shape
top_k = topk_ids.shape[1]
out = ops.ggml_moe_a8_vec(x, w1, topk_ids, top_k, qweight_type, N,
num_tokens)
out = act(out)
out = ops.ggml_moe_a8_vec(out, w2, topk_ids, 1, qweight_type2,
w2.shape[1], num_tokens * top_k)
out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
topk_weights.view(num_tokens, top_k, 1))
ops.moe_sum(out, out_hidden_states)
else:
logger.warning_once("There is no support for fast MoE kernel "
"for current quantization method. "