[Kernel] Add GPTQv2 format support for low-bit or asymmetric quantization, by adapting gptq_gemm (#26092)

This commit is contained in:
Xiangyu Li
2025-10-24 11:26:13 +08:00
committed by GitHub
parent 1f9460c4c1
commit 5cc6bddb6e
8 changed files with 295 additions and 98 deletions

View File

@@ -26,4 +26,10 @@ def test_gptq_gemm_opcheck():
idx = torch.empty((0,), device="cuda", dtype=torch.int32)
use_exllama = True
bit = 4
opcheck(torch.ops._C.gptq_gemm, (a, weight, zeros, scales, idx, use_exllama, bit))
# Test both GPTQv1 and GPTQv2 formats
opcheck(
torch.ops._C.gptq_gemm, (a, weight, zeros, scales, idx, use_exllama, True, bit)
)
opcheck(
torch.ops._C.gptq_gemm, (a, weight, zeros, scales, idx, use_exllama, False, bit)
)