Categorize tests/kernels/ based on kernel type (#16799)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
38
tests/kernels/quantization/test_ggml.py
Normal file
38
tests/kernels/quantization/test_ggml.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import gguf
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops # noqa: F401
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quant_type", [12])
def test_ggml_opcheck(quant_type):
    """Schema-check the GGML custom ops via ``opcheck``.

    Exercises ``ggml_dequantize``, ``ggml_mul_mat_a8``,
    ``ggml_mul_mat_vec_a8`` and the fused-MoE ``ggml_moe_a8`` kernel with
    random uint8 quantized weights for quant type 12 (Q4_K in the GGUF
    enum).
    """
    # Elements per quant block and the block's byte size for this type.
    blk_elems, blk_bytes = gguf.GGML_QUANT_SIZES[quant_type]

    qweight = torch.randint(0, 100, [256, 1152], device='cuda',
                            dtype=torch.uint8)
    rows = qweight.shape[0]
    # Logical column count once the packed bytes are dequantized.
    cols = qweight.shape[1] // blk_bytes * blk_elems
    opcheck(torch.ops._C.ggml_dequantize,
            (qweight, quant_type, rows, cols, torch.float16))

    act = torch.rand((rows, 512), device='cuda', dtype=torch.float16)
    opcheck(torch.ops._C.ggml_mul_mat_a8,
            (qweight, act, quant_type, qweight.shape[0]))
    opcheck(torch.ops._C.ggml_mul_mat_vec_a8,
            (qweight, act, quant_type, qweight.shape[0]))

    # MoE variant: 3-D per-expert weights plus routing metadata tensors.
    qweight = torch.randint(0, 100, [256, 1024, 336], device='cuda',
                            dtype=torch.uint8)
    act = torch.rand((1, 1024), device='cuda', dtype=torch.float16)
    sorted_token_ids = torch.arange(776, device='cuda')
    expert_ids = torch.randint(0, 256, (194, ), device='cuda')
    num_tokens_post_padded = torch.tensor([1], dtype=torch.int64,
                                          device='cuda')

    opcheck(torch.ops._C.ggml_moe_a8,
            (act, qweight, sorted_token_ids, expert_ids,
             num_tokens_post_padded, quant_type, qweight.shape[0], 1,
             act.shape[0]))
|
||||
Reference in New Issue
Block a user