[Quantization] add BNB for MixtralForCausalLM (#20893)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Author: Jee Jee Li
Date: 2025-07-14 15:34:34 +08:00
Committed by: GitHub
Parent: c488b928a7
Commit: a99b9f7dee
7 changed files with 128 additions and 20 deletions


@@ -227,7 +227,12 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     mixtral_supported = [
-        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "quark"
+        "fp8",
+        "compressed-tensors",
+        "gptq_marlin",
+        "awq_marlin",
+        "quark",
+        "bitsandbytes",
     ]
     vllm_supported_archs = ModelRegistry.get_supported_archs()
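
With "bitsandbytes" added to mixtral_supported, Mixtral checkpoints quantized with bitsandbytes (BNB) keep the standard MixtralForCausalLM architecture rather than being rerouted through the quantized fallback path. A minimal usage sketch under this change follows; the model name and the explicit load_format argument are illustrative assumptions and may vary by vLLM version, not something this commit defines.

    # A minimal usage sketch, not part of this commit. Assumes a vLLM build
    # that includes this change; the checkpoint name and the explicit
    # load_format argument are assumptions and may differ across versions.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # assumed checkpoint
        quantization="bitsandbytes",  # in-flight BNB quantization
        load_format="bitsandbytes",   # may be inferred automatically in newer versions
    )

    outputs = llm.generate(
        ["The capital of France is"],
        SamplingParams(temperature=0.0, max_tokens=16),
    )
    print(outputs[0].outputs[0].text)

Note that the list above only gates architecture selection in get_model_architecture; the actual BNB weight loading is handled elsewhere in the files changed by this commit.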