[Kernel/Quant] Remove the original marlin format and qqq (#23204)

Signed-off-by: mgoin <mgoin64@gmail.com>
Author: Michael Goin
Date: 2025-08-20 15:13:36 -04:00
Committed by: GitHub
Commit: 0cdbf5e61c (parent: ebe56a0064)
26 changed files with 92 additions and 3698 deletions

@@ -1112,9 +1112,9 @@ class ModelConfig:
     def _verify_quantization(self) -> None:
         supported_quantization = me_quant.QUANTIZATION_METHODS
         optimized_quantization_methods = [
-            "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
-            "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
-            "quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc"
+            "fp8", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
+            "fbgemm_fp8", "compressed-tensors", "experts_int8", "quark",
+            "modelopt_fp4", "bitblas", "gptq_bitblas", "inc"
         ]
         if self.quantization is not None:
             self.quantization = cast(me_quant.QuantizationMethods,
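For context, optimized_quantization_methods is only consumed by a warning for methods that lack an optimized kernel path, so dropping "marlin" here goes hand in hand with removing the standalone Marlin format from QUANTIZATION_METHODS entirely. A minimal sketch of how such a check uses the list (the helper name and exact log message are assumptions, not the verbatim vLLM source):

import logging

logger = logging.getLogger(__name__)

def warn_if_unoptimized(quantization: str,
                        optimized_methods: list[str]) -> None:
    # Hypothetical helper mirroring the tail of _verify_quantization:
    # unlisted methods still run, they are just expected to be slower.
    if quantization not in optimized_methods:
        logger.warning(
            "%s quantization is not fully optimized yet. The speed can "
            "be slower than non-quantized models.", quantization)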
@@ -1137,7 +1137,6 @@ class ModelConfig:
         # `override_quantization_method` method) must be checked in order
         # of preference (this is particularly important for GPTQ).
         overrides = [
-            "marlin",
             "bitblas",
             "gptq_marlin_24",
             "gptq_marlin",