[Model][Quantization] HQQ support through Marlin kernel expansion (#9766)

Signed-off-by: ElizaWszola <eliza@neuralmagic.com>
This commit is contained in:
ElizaWszola
2024-11-19 22:31:12 +01:00
committed by GitHub
parent efa9084628
commit b00b33d77e
11 changed files with 632 additions and 89 deletions

View File

@@ -303,7 +303,8 @@ def apply_gptq_marlin_linear(
     size_k=input_size_per_partition,
     is_k_full=is_k_full,
     has_zp=False,
-    use_fp32_reduce=use_fp32_reduce)
+    use_fp32_reduce=use_fp32_reduce,
+    is_zp_float=False)
 if bias is not None:
     output.add_(bias)  # In-place add
@@ -340,7 +341,8 @@ def apply_awq_marlin_linear(
     size_k=input_size_per_partition,
     is_k_full=True,
     has_zp=True,
-    use_fp32_reduce=use_fp32_reduce)
+    use_fp32_reduce=use_fp32_reduce,
+    is_zp_float=False)
 if bias is not None:
     output.add_(bias)  # In-place add