[Kernel] Add GPTQv2 format support for low-bit or asymmetric quantization, by adapting gptq_gemm (#26092)

This commit is contained in:
Xiangyu Li
2025-10-24 11:26:13 +08:00
committed by GitHub
parent 1f9460c4c1
commit 5cc6bddb6e
8 changed files with 295 additions and 98 deletions

View File

@@ -26,4 +26,10 @@ def test_gptq_gemm_opcheck():
idx = torch.empty((0,), device="cuda", dtype=torch.int32)
use_exllama = True
bit = 4
opcheck(torch.ops._C.gptq_gemm, (a, weight, zeros, scales, idx, use_exllama, bit))
# Test both GPTQv1 and GPTQv2 formats
opcheck(
torch.ops._C.gptq_gemm, (a, weight, zeros, scales, idx, use_exllama, True, bit)
)
opcheck(
torch.ops._C.gptq_gemm, (a, weight, zeros, scales, idx, use_exllama, False, bit)
)