[Quantization] add BNB for MixtralForCausalLM (#20893)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Author: Jee Jee Li
Date: 2025-07-14 15:34:34 +08:00
Committed by: GitHub
Parent: c488b928a7
Commit: a99b9f7dee
7 changed files with 128 additions and 20 deletions


@@ -227,7 +227,12 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     mixtral_supported = [
-        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "quark"
+        "fp8",
+        "compressed-tensors",
+        "gptq_marlin",
+        "awq_marlin",
+        "quark",
+        "bitsandbytes",
     ]
     vllm_supported_archs = ModelRegistry.get_supported_archs()
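
With "bitsandbytes" added to mixtral_supported, Mixtral checkpoints quantized with bitsandbytes (BNB) keep the standard MixtralForCausalLM architecture rather than being rerouted through the quantized fallback path. A minimal usage sketch under this change follows; the model name and the explicit load_format argument are illustrative assumptions and may vary by vLLM version, not something this commit defines.

    # A minimal usage sketch, not part of this commit. Assumes a vLLM build
    # that includes this change; the checkpoint name and the explicit
    # load_format argument are assumptions and may differ across versions.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # assumed checkpoint
        quantization="bitsandbytes",  # in-flight BNB quantization
        load_format="bitsandbytes",   # may be inferred automatically in newer versions
    )

    outputs = llm.generate(
        ["The capital of France is"],
        SamplingParams(temperature=0.0, max_tokens=16),
    )
    print(outputs[0].outputs[0].text)

Note that the list above only gates architecture selection in get_model_architecture; the actual BNB weight loading is handled elsewhere in the files changed by this commit.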