[Model][Quantization] HQQ support through Marlin kernel expansion (#9766)

Signed-off-by: ElizaWszola <eliza@neuralmagic.com>
This commit is contained in:
ElizaWszola
2024-11-19 22:31:12 +01:00
committed by GitHub
parent efa9084628
commit b00b33d77e
11 changed files with 632 additions and 89 deletions

View File

@@ -303,7 +303,8 @@ def apply_gptq_marlin_linear(
     size_k=input_size_per_partition,
     is_k_full=is_k_full,
     has_zp=False,
-    use_fp32_reduce=use_fp32_reduce)
+    use_fp32_reduce=use_fp32_reduce,
+    is_zp_float=False)
 if bias is not None:
     output.add_(bias)  # In-place add
@@ -340,7 +341,8 @@ def apply_awq_marlin_linear(
     size_k=input_size_per_partition,
     is_k_full=True,
     has_zp=True,
-    use_fp32_reduce=use_fp32_reduce)
+    use_fp32_reduce=use_fp32_reduce,
+    is_zp_float=False)
 if bias is not None:
     output.add_(bias)  # In-place add