DOC: Documentation pages fixes (#38125)

Signed-off-by: Mateusz Sokół <mat646@gmail.com>
This commit is contained in:
Mateusz Sokół
2026-03-26 09:55:42 +01:00
committed by GitHub
parent 6ae8bbd0c2
commit b1cb1d3d2c

View File

@@ -24,38 +24,43 @@ class LoadConfig:
"""Configuration for loading the model weights."""
load_format: str | LoadFormats = "auto"
"""The format of the model weights to load:\n
"""
The format of the model weights to load.
- "auto" will try to load the weights in the safetensors format and fall
back to the pytorch bin format if safetensors format is not available.\n
- "pt" will load the weights in the pytorch bin format.\n
- "safetensors" will load the weights in the safetensors format.\n
back to the pytorch bin format if safetensors format is not available.
- "pt" will load the weights in the pytorch bin format.
- "safetensors" will load the weights in the safetensors format.
- "instanttensor" will load the Safetensors weights on CUDA devices using
InstantTensor, which enables distributed loading with pipelined prefetching
and fast direct I/O.\n
InstantTensor, which enables distributed loading with pipelined prefetching
and fast direct I/O.
- "npcache" will load the weights in pytorch format and store a numpy cache
to speed up the loading.\n
to speed up the loading.
- "dummy" will initialize the weights with random values, which is mainly
for profiling.\n
for profiling.
- "tensorizer" will use CoreWeave's tensorizer library for fast weight
loading. See the Tensorize vLLM Model script in the Examples section for
more information.\n
loading. See the Tensorize vLLM Model script in the Examples section for
more information.
- "runai_streamer" will load the Safetensors weights using Run:ai Model
Streamer.\n
Streamer.
- "runai_streamer_sharded" will load weights from pre-sharded checkpoint
files using Run:ai Model Streamer.\n
- "bitsandbytes" will load the weights using bitsandbytes quantization.\n
files using Run:ai Model Streamer.
- "bitsandbytes" will load the weights using bitsandbytes quantization.
- "sharded_state" will load weights from pre-sharded checkpoint files,
supporting efficient loading of tensor-parallel models.\n
supporting efficient loading of tensor-parallel models.
- "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
- "mistral" will load weights from consolidated safetensors files used by
Mistral models.\n
- Other custom values can be supported via plugins."""
Mistral models.
- Other custom values can be supported via plugins.
"""
download_dir: str | None = None
"""Directory to download and load the weights, default to the default
cache directory of Hugging Face."""
safetensors_load_strategy: str | None = None
"""Specifies the loading strategy for safetensors weights.
"""
Specifies the loading strategy for safetensors weights.
- None (default): Uses memory-mapped (lazy) loading. When an NFS
filesystem is detected and the total checkpoint size fits within 90%
of available RAM, prefetching is enabled automatically.
@@ -72,7 +77,7 @@ class LoadConfig:
- "torchao": Weights are loaded in upfront and then reconstructed
into torchao tensor subclasses. This is used when the checkpoint
was quantized using torchao and saved using safetensors.
Needs torchao >= 0.14.0
Needs `torchao >= 0.14.0`.
"""
model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
"""Extra config for model loader. This will be passed to the model loader
@@ -88,13 +93,13 @@ class LoadConfig:
weights."""
pt_load_map_location: str | dict[str, str] = "cpu"
"""
pt_load_map_location: the map location for loading pytorch checkpoint, to
support loading checkpoints can only be loaded on certain devices like
"cuda", this is equivalent to {"": "cuda"}. Another supported format is
mapping from different devices like from GPU 1 to GPU 0:
{"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
in dictionary needs to be double quoted for json parsing. For more details,
see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
The map location for loading pytorch checkpoint, to support loading
checkpoints that can only be loaded on certain devices like "cuda", this
is equivalent to `{"": "cuda"}`. Another supported format is mapping
from different devices like from GPU 1 to GPU 0: `{"cuda:1": "cuda:0"}`.
Note that when passed from command line, the strings in dictionary
need to be double quoted for json parsing. For more details, see
the original doc for the `map_location` parameter in [`torch.load`][].
"""
def compute_hash(self) -> str: