diff --git a/vllm/config/load.py b/vllm/config/load.py index 1ae32423f..e77d9b378 100644 --- a/vllm/config/load.py +++ b/vllm/config/load.py @@ -24,38 +24,43 @@ class LoadConfig: """Configuration for loading the model weights.""" load_format: str | LoadFormats = "auto" - """The format of the model weights to load:\n + """ + The format of the model weights to load. + - "auto" will try to load the weights in the safetensors format and fall - back to the pytorch bin format if safetensors format is not available.\n - - "pt" will load the weights in the pytorch bin format.\n - - "safetensors" will load the weights in the safetensors format.\n + back to the pytorch bin format if safetensors format is not available. + - "pt" will load the weights in the pytorch bin format. + - "safetensors" will load the weights in the safetensors format. - "instanttensor" will load the Safetensors weights on CUDA devices using - InstantTensor, which enables distributed loading with pipelined prefetching - and fast direct I/O.\n + InstantTensor, which enables distributed loading with pipelined prefetching + and fast direct I/O. - "npcache" will load the weights in pytorch format and store a numpy cache - to speed up the loading.\n + to speed up the loading. - "dummy" will initialize the weights with random values, which is mainly - for profiling.\n + for profiling. - "tensorizer" will use CoreWeave's tensorizer library for fast weight - loading. See the Tensorize vLLM Model script in the Examples section for - more information.\n + loading. See the Tensorize vLLM Model script in the Examples section for + more information. - "runai_streamer" will load the Safetensors weights using Run:ai Model - Streamer.\n + Streamer. - "runai_streamer_sharded" will load weights from pre-sharded checkpoint - files using Run:ai Model Streamer.\n - - "bitsandbytes" will load the weights using bitsandbytes quantization.\n + files using Run:ai Model Streamer. 
+ - "bitsandbytes" will load the weights using bitsandbytes quantization. - "sharded_state" will load weights from pre-sharded checkpoint files, - supporting efficient loading of tensor-parallel models.\n + supporting efficient loading of tensor-parallel models. - "gguf" will load weights from GGUF format files (details specified in - https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n + https://github.com/ggml-org/ggml/blob/master/docs/gguf.md). - "mistral" will load weights from consolidated safetensors files used by - Mistral models.\n - - Other custom values can be supported via plugins.""" + Mistral models.\n + - Other custom values can be supported via plugins. + """ download_dir: str | None = None """Directory to download and load the weights, default to the default cache directory of Hugging Face.""" safetensors_load_strategy: str | None = None - """Specifies the loading strategy for safetensors weights. + """ + Specifies the loading strategy for safetensors weights. + - None (default): Uses memory-mapped (lazy) loading. When an NFS filesystem is detected and the total checkpoint size fits within 90%% of available RAM, prefetching is enabled automatically. @@ -72,7 +77,7 @@ class LoadConfig: - "torchao": Weights are loaded in upfront and then reconstructed into torchao tensor subclasses. This is used when the checkpoint was quantized using torchao and saved using safetensors. - Needs torchao >= 0.14.0 + Needs `torchao >= 0.14.0`. """ model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict) """Extra config for model loader. This will be passed to the model loader @@ -88,13 +93,13 @@ class LoadConfig: weights.""" pt_load_map_location: str | dict[str, str] = "cpu" """ - pt_load_map_location: the map location for loading pytorch checkpoint, to - support loading checkpoints can only be loaded on certain devices like - "cuda", this is equivalent to {"": "cuda"}. 
Another supported format is - mapping from different devices like from GPU 1 to GPU 0: - {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings - in dictionary needs to be double quoted for json parsing. For more details, - see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html + The map location for loading pytorch checkpoint, to support loading + checkpoints that can only be loaded on certain devices like "cuda", this + is equivalent to `{"": "cuda"}`. Another supported format is mapping + from different devices like from GPU 1 to GPU 0: `{"cuda:1": "cuda:0"}`. + Note that when passed from command line, the strings in the dictionary + need to be double quoted for json parsing. For more details, see + the original doc for the `map_location` parameter in [`torch.load`][]. """ def compute_hash(self) -> str: