support bitsandbytes 8-bit and FP4 quantized models (#7445)

Author: chenqianfzh
Date: 2024-08-29 16:09:08 -07:00
Committed by: GitHub
Parent: 257afc37c5
Commit: 4664ceaad6
6 changed files with 435 additions and 189 deletions

vllm/config.py

@@ -405,6 +405,8 @@ class ModelConfig:
             raise ValueError(
                 "BitAndBytes quantization with TP or PP is not supported yet.")
 
+        # Remove the constraint after the bitsandbytes issue is fixed:
+        # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
         if self.quantization == "bitsandbytes" and self.enforce_eager is False:
             logger.warning("CUDA graph is not supported on BitAndBytes yet, "
                            "fallback to the eager mode.")