[Kernel] GGUF MoeVec kernel (#16780)

Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com> Signed-off-by: SzymonOzog <szymon.ozog@gmail.com> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com>
2025-05-07 14:07:23 +08:00
parent c3e9d5060e
commit 1a45a61387
8 changed files with 544 additions and 16 deletions
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -145,7 +145,9 @@ def _fused_moe_gguf(
        moe_align_block_size)

    out_hidden_states = torch.empty_like(x)
-    if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES:
+    # unless we decent expert reuse we are better off running moe_vec kernel
+    if (qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES
+            and x.shape[0] > 64):
        num_tokens, _ = x.shape
        E, N, _ = w1.shape
        top_k = topk_ids.shape[1]
@@ -163,6 +165,20 @@ def _fused_moe_gguf(
        out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
            topk_weights.view(num_tokens, top_k, 1))
        ops.moe_sum(out, out_hidden_states)
+    elif qweight_type2 in MMVQ_QUANT_TYPES and qweight_type in MMVQ_QUANT_TYPES:
+        num_tokens, _ = x.shape
+        E, N, _ = w1.shape
+        top_k = topk_ids.shape[1]
+
+        out = ops.ggml_moe_a8_vec(x, w1, topk_ids, top_k, qweight_type, N,
+                                  num_tokens)
+        out = act(out)
+
+        out = ops.ggml_moe_a8_vec(out, w2, topk_ids, 1, qweight_type2,
+                                  w2.shape[1], num_tokens * top_k)
+        out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
+            topk_weights.view(num_tokens, top_k, 1))
+        ops.moe_sum(out, out_hidden_states)
    else:
        logger.warning_once("There is no support for fast MoE kernel "
                            "for current quantization method. "