[Bugfix][Kernel] Add IQ1_M quantization implementation to GGUF kernel (#8357)

This commit is contained in:
Isotr0py
2024-09-16 06:51:44 +08:00
committed by GitHub
parent 3724d5f6b5
commit fc990f9795
8 changed files with 547 additions and 161 deletions

View File

@@ -55,7 +55,10 @@ class GGUFConfig(QuantizationConfig):
def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
qweight_type: int) -> torch.Tensor:
# use dequantize mulmat for IQmatrix, mmq for k-quants
if qweight_type >= 16:
if x.shape[0] == 1:
# enable mmvq in continuous batching
y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
elif qweight_type >= 16:
block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
weight = ops.ggml_dequantize(qweight, qweight_type, *shape)