[Quantization] Quark MXFP4 format loading (#16943)

Author: Bowen Bao
Date: 2025-05-07 12:05:05 -07:00
Committed by: GitHub
Parent: f98e307588
Commit: db593aa67f
9 changed files with 289 additions and 3 deletions


@@ -220,7 +220,7 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     mixtral_supported = [
-        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"
+        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "quark"
     ]
     if (model_config.quantization is not None
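
For context, a minimal sketch of how a check like this could consume `mixtral_supported`. The hunk only shows the list and the opening of the conditional; the helper name `resolve_mixtral_architecture` and the fallback class name `QuantMixtralForCausalLM` below are assumptions for illustration, not part of this diff:

```python
from typing import List, Optional

def resolve_mixtral_architecture(
    quantization: Optional[str], architectures: List[str]
) -> List[str]:
    # Hypothetical helper mirroring the hunk above: a quantized Mixtral
    # checkpoint whose quantization method is NOT natively supported is
    # rerouted to a dedicated quantized architecture.
    mixtral_supported = [
        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "quark"
    ]
    if (quantization is not None
            and quantization not in mixtral_supported
            and "MixtralForCausalLM" in architectures):
        # Assumed fallback class name. With "quark" added to the list
        # above, Quark checkpoints skip this rewrite and load through the
        # standard Mixtral path.
        return ["QuantMixtralForCausalLM"]
    return architectures

# Example: a Quark-quantized Mixtral now keeps the standard architecture.
print(resolve_mixtral_architecture("quark", ["MixtralForCausalLM"]))
# -> ['MixtralForCausalLM']
```

The one-line change therefore means Quark-quantized Mixtral checkpoints are treated as natively supported rather than falling through to the temporary hack path.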