DOC: Documentation pages fixes (#38125)
Signed-off-by: Mateusz Sokół <mat646@gmail.com>
This commit is contained in:
@@ -24,38 +24,43 @@ class LoadConfig:
|
||||
"""Configuration for loading the model weights."""
|
||||
|
||||
load_format: str | LoadFormats = "auto"
|
||||
"""The format of the model weights to load:\n
|
||||
"""
|
||||
The format of the model weights to load.
|
||||
|
||||
- "auto" will try to load the weights in the safetensors format and fall
|
||||
back to the pytorch bin format if safetensors format is not available.\n
|
||||
- "pt" will load the weights in the pytorch bin format.\n
|
||||
- "safetensors" will load the weights in the safetensors format.\n
|
||||
back to the pytorch bin format if safetensors format is not available.
|
||||
- "pt" will load the weights in the pytorch bin format.
|
||||
- "safetensors" will load the weights in the safetensors format.
|
||||
- "instanttensor" will load the Safetensors weights on CUDA devices using
|
||||
InstantTensor, which enables distributed loading with pipelined prefetching
|
||||
and fast direct I/O.\n
|
||||
InstantTensor, which enables distributed loading with pipelined prefetching
|
||||
and fast direct I/O.
|
||||
- "npcache" will load the weights in pytorch format and store a numpy cache
|
||||
to speed up the loading.\n
|
||||
to speed up the loading.
|
||||
- "dummy" will initialize the weights with random values, which is mainly
|
||||
for profiling.\n
|
||||
for profiling.
|
||||
- "tensorizer" will use CoreWeave's tensorizer library for fast weight
|
||||
loading. See the Tensorize vLLM Model script in the Examples section for
|
||||
more information.\n
|
||||
loading. See the Tensorize vLLM Model script in the Examples section for
|
||||
more information.
|
||||
- "runai_streamer" will load the Safetensors weights using Run:ai Model
|
||||
Streamer.\n
|
||||
Streamer.
|
||||
- "runai_streamer_sharded" will load weights from pre-sharded checkpoint
|
||||
files using Run:ai Model Streamer.\n
|
||||
- "bitsandbytes" will load the weights using bitsandbytes quantization.\n
|
||||
files using Run:ai Model Streamer.
|
||||
- "bitsandbytes" will load the weights using bitsandbytes quantization.
|
||||
- "sharded_state" will load weights from pre-sharded checkpoint files,
|
||||
supporting efficient loading of tensor-parallel models.\n
|
||||
supporting efficient loading of tensor-parallel models.
|
||||
- "gguf" will load weights from GGUF format files (details specified in
|
||||
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
|
||||
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
|
||||
- "mistral" will load weights from consolidated safetensors files used by
|
||||
Mistral models.\n
|
||||
- Other custom values can be supported via plugins."""
|
||||
Mistral models.
|
||||
- Other custom values can be supported via plugins.
|
||||
"""
|
||||
download_dir: str | None = None
|
||||
"""Directory to download and load the weights, default to the default
|
||||
cache directory of Hugging Face."""
|
||||
safetensors_load_strategy: str | None = None
|
||||
"""Specifies the loading strategy for safetensors weights.
|
||||
"""
|
||||
Specifies the loading strategy for safetensors weights.
|
||||
|
||||
- None (default): Uses memory-mapped (lazy) loading. When an NFS
|
||||
filesystem is detected and the total checkpoint size fits within 90%%
|
||||
of available RAM, prefetching is enabled automatically.
|
||||
@@ -72,7 +77,7 @@ class LoadConfig:
|
||||
- "torchao": Weights are loaded upfront and then reconstructed
|
||||
into torchao tensor subclasses. This is used when the checkpoint
|
||||
was quantized using torchao and saved using safetensors.
|
||||
Needs torchao >= 0.14.0
|
||||
Needs `torchao >= 0.14.0`.
|
||||
"""
|
||||
model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
|
||||
"""Extra config for model loader. This will be passed to the model loader
|
||||
@@ -88,13 +93,13 @@ class LoadConfig:
|
||||
weights."""
|
||||
pt_load_map_location: str | dict[str, str] = "cpu"
|
||||
"""
|
||||
pt_load_map_location: the map location for loading pytorch checkpoint, to
|
||||
support loading checkpoints can only be loaded on certain devices like
|
||||
"cuda", this is equivalent to {"": "cuda"}. Another supported format is
|
||||
mapping from different devices like from GPU 1 to GPU 0:
|
||||
{"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
|
||||
in dictionary needs to be double quoted for json parsing. For more details,
|
||||
see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
|
||||
The map location for loading a PyTorch checkpoint, to support loading
|
||||
checkpoints that can only be loaded on certain devices like "cuda", this
|
||||
is equivalent to `{"": "cuda"}`. Another supported format is mapping
|
||||
from different devices like from GPU 1 to GPU 0: `{"cuda:1": "cuda:0"}`.
|
||||
Note that when passed from command line, the strings in dictionary
|
||||
need to be double quoted for json parsing. For more details, see
|
||||
the original doc for the `map_location` parameter of [`torch.load`][].
|
||||
"""
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
|
||||
Reference in New Issue
Block a user