[Kernel] GGUF MoeVec kernel (#16780)

Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-05-07 14:07:23 +08:00
parent c3e9d5060e
commit 1a45a61387
8 changed files with 544 additions and 16 deletions
--- a/tests/kernels/quantization/test_ggml.py
+++ b/tests/kernels/quantization/test_ggml.py
@@ -36,3 +36,9 @@ def test_ggml_opcheck(quant_type):
    opcheck(torch.ops._C.ggml_moe_a8,
            (x, qweight, sorted_token_ids, expert_ids, num_tokens_post_padded,
             quant_type, qweight.shape[0], 1, x.shape[0]))
+
+    topk_ids = torch.zeros((1, 1), device='cuda', dtype=torch.int32)
+
+    opcheck(
+        torch.ops._C.ggml_moe_a8_vec,
+        (x, qweight, topk_ids, 1, quant_type, qweight.shape[0], x.shape[0]))