[bitsandbytes]: support reading bnb pre-quantized models (#5753)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-23 16:45:09 -07:00
parent 2f808e69ab
commit 87525fab92
8 changed files with 143 additions and 39 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -591,9 +591,11 @@ class LoadConfig:
                mainly for profiling.
            "tensorizer" will use CoreWeave's tensorizer library for
                fast weight loading.
+            "bitsandbytes" will load nf4-type weights.
        ignore_patterns: The list of patterns to ignore when loading the model.
            Default to "original/**/*" to avoid repeated loading of llama's 
            checkpoints.
+            
    """

    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO