Update Optional[x] -> x | None and Union[x, y] to x | y (#26633)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
|
||||
import hashlib
|
||||
from dataclasses import field
|
||||
from typing import TYPE_CHECKING, Any, Literal, Optional
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
from pydantic import Field, SkipValidation, field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
@@ -58,13 +58,13 @@ class CacheConfig:
|
||||
is_attention_free: bool = False
|
||||
"""Whether the model is attention-free. This is primarily set in
|
||||
`ModelConfig` and that value should be manually duplicated here."""
|
||||
num_gpu_blocks_override: Optional[int] = None
|
||||
num_gpu_blocks_override: int | None = None
|
||||
"""Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
|
||||
if specified. Does nothing if `None`. Used for testing preemption."""
|
||||
sliding_window: Optional[int] = None
|
||||
sliding_window: int | None = None
|
||||
"""Sliding window size for the KV cache. This is primarily set in
|
||||
`ModelConfig` and that value should be manually duplicated here."""
|
||||
enable_prefix_caching: Optional[bool] = None
|
||||
enable_prefix_caching: bool | None = None
|
||||
"""Whether to enable prefix caching. Enabled by default for V1."""
|
||||
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
|
||||
"""Set the hash algorithm for prefix caching:\n
|
||||
@@ -84,12 +84,12 @@ class CacheConfig:
|
||||
"""This enables dynamic calculation of `k_scale` and `v_scale` when
|
||||
kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
|
||||
checkpoint if available. Otherwise, the scales will default to 1.0."""
|
||||
cpu_kvcache_space_bytes: Optional[int] = None
|
||||
cpu_kvcache_space_bytes: int | None = None
|
||||
"""(CPU backend only) CPU key-value cache space."""
|
||||
mamba_page_size_padded: Optional[int] = None
|
||||
mamba_page_size_padded: int | None = None
|
||||
""" Optional override for mamba page size; used by hybrid mamba/attention
|
||||
models to ensure exact alignment with attention page size."""
|
||||
mamba_block_size: Optional[int] = None
|
||||
mamba_block_size: int | None = None
|
||||
"""Size of a contiguous cache block in number of tokens for mamba cache."""
|
||||
mamba_cache_dtype: MambaDType = "auto"
|
||||
"""The data type to use for the Mamba cache (both the conv as well as the
|
||||
@@ -101,9 +101,9 @@ class CacheConfig:
|
||||
for the ssm state will be determined by mamba_cache_dtype."""
|
||||
|
||||
# Will be set after profiling.
|
||||
num_gpu_blocks: Optional[int] = field(default=None, init=False)
|
||||
num_gpu_blocks: int | None = field(default=None, init=False)
|
||||
"""The number of blocks to allocate for GPU memory."""
|
||||
num_cpu_blocks: Optional[int] = field(default=None, init=False)
|
||||
num_cpu_blocks: int | None = field(default=None, init=False)
|
||||
"""The number of blocks to allocate for CPU memory."""
|
||||
|
||||
kv_sharing_fast_prefill: bool = False
|
||||
@@ -116,7 +116,7 @@ class CacheConfig:
|
||||
necessary for implementing this optimization in some models (e.g. Gemma3n)
|
||||
"""
|
||||
|
||||
kv_cache_memory_bytes: Optional[int] = None
|
||||
kv_cache_memory_bytes: int | None = None
|
||||
"""Size of KV Cache per GPU in bytes. By default, this is set to None
|
||||
and vllm can automatically infer the kv cache size based on
|
||||
gpu_memory_utilization. However, users may want to manually specify
|
||||
|
||||
@@ -4,9 +4,10 @@
|
||||
import enum
|
||||
import hashlib
|
||||
from collections import Counter
|
||||
from collections.abc import Callable
|
||||
from dataclasses import asdict, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, ClassVar
|
||||
|
||||
from pydantic import TypeAdapter, field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
@@ -168,7 +169,7 @@ class CompilationConfig:
|
||||
"""
|
||||
|
||||
# Top-level Compilation control
|
||||
level: Optional[int] = None
|
||||
level: int | None = None
|
||||
"""The level of compilation:
|
||||
|
||||
- None: If None, we will select the default compilation level.
|
||||
@@ -177,7 +178,7 @@ class CompilationConfig:
|
||||
- 1: dynamo as is.
|
||||
- 2: dynamo once.
|
||||
- 3: piecewise compilation."""
|
||||
debug_dump_path: Optional[Path] = None
|
||||
debug_dump_path: Path | None = None
|
||||
"""The path to dump the debug information."""
|
||||
cache_dir: str = ""
|
||||
"""The directory to store the compiled graph, to accelerate Inductor
|
||||
@@ -208,7 +209,7 @@ class CompilationConfig:
|
||||
By default, all custom ops are enabled when running without Inductor and
|
||||
disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
|
||||
Inductor generates (fused) Triton kernels for disabled custom ops."""
|
||||
splitting_ops: Optional[list[str]] = None
|
||||
splitting_ops: list[str] | None = None
|
||||
"""A list of ops to exclude from cudagraphs, used in piecewise compilation.
|
||||
|
||||
The behavior depends on use_inductor_graph_partition:
|
||||
@@ -238,7 +239,7 @@ class CompilationConfig:
|
||||
are compiled using configurations in inductor_compile_config.
|
||||
|
||||
This setting is ignored if level<PIECEWISE."""
|
||||
compile_sizes: Optional[list[Union[int, str]]] = None
|
||||
compile_sizes: list[int | str] | None = None
|
||||
"""Sizes to compile for inductor. In addition
|
||||
to integers, it also supports "cudagraph_capture_sizes" to
|
||||
specify the sizes for cudagraph capture."""
|
||||
@@ -253,7 +254,7 @@ class CompilationConfig:
|
||||
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
|
||||
|
||||
# CudaGraph compilation
|
||||
cudagraph_mode: Optional[CUDAGraphMode] = None
|
||||
cudagraph_mode: CUDAGraphMode | None = None
|
||||
"""
|
||||
The mode of the cudagraph:
|
||||
|
||||
@@ -308,7 +309,7 @@ class CompilationConfig:
|
||||
It means the first several runs will be treated as warmup runs.
|
||||
Only after that, the execution will be recorded, and the recorded
|
||||
cudagraph will be used for subsequent runs."""
|
||||
cudagraph_capture_sizes: Optional[list[int]] = None
|
||||
cudagraph_capture_sizes: list[int] | None = None
|
||||
"""Sizes to capture cudagraph.
|
||||
- None (default): capture sizes are inferred from vllm config.
|
||||
- list[int]: capture sizes are specified as given."""
|
||||
@@ -320,7 +321,7 @@ class CompilationConfig:
|
||||
internally managed buffer. Default is False.
|
||||
Note that this flag is only effective when cudagraph_mode is PIECEWISE.
|
||||
"""
|
||||
full_cuda_graph: Optional[bool] = False
|
||||
full_cuda_graph: bool | None = False
|
||||
"""whether to use a full cuda graph for the entire forward pass rather than
|
||||
splitting certain operations such as attention into subgraphs. Thus this
|
||||
flag cannot be used together with splitting_ops. This may provide
|
||||
@@ -544,7 +545,7 @@ class CompilationConfig:
|
||||
"(where 'op' is the registered op name)"
|
||||
)
|
||||
|
||||
def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
|
||||
def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
|
||||
if self.level == CompilationLevel.NO_COMPILATION:
|
||||
raise ValueError("No compilation level is set.")
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import hashlib
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal, Optional, Union
|
||||
from typing import Any, Literal
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict, SkipValidation
|
||||
@@ -19,7 +19,7 @@ Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
|
||||
class DeviceConfig:
|
||||
"""Configuration for the device to use for vLLM execution."""
|
||||
|
||||
device: SkipValidation[Optional[Union[Device, torch.device]]] = "auto"
|
||||
device: SkipValidation[Device | torch.device | None] = "auto"
|
||||
"""Device type for vLLM execution.
|
||||
This parameter is deprecated and will be
|
||||
removed in a future release.
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
@@ -26,7 +25,7 @@ class KVEventsConfig:
|
||||
"""The zmq endpoint to use for publishing kv events.
|
||||
"""
|
||||
|
||||
replay_endpoint: Optional[str] = None
|
||||
replay_endpoint: str | None = None
|
||||
"""The zmq endpoint to use for replaying kv events.
|
||||
"""
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
import hashlib
|
||||
import uuid
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal, Optional, get_args
|
||||
from typing import Any, Literal, get_args
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
@@ -20,14 +20,14 @@ KVRole = Literal[KVProducer, KVConsumer]
|
||||
class KVTransferConfig:
|
||||
"""Configuration for distributed KV cache transfer."""
|
||||
|
||||
kv_connector: Optional[str] = None
|
||||
kv_connector: str | None = None
|
||||
"""The KV connector for vLLM to transmit KV caches between vLLM instances.
|
||||
"""
|
||||
|
||||
engine_id: Optional[str] = None
|
||||
engine_id: str | None = None
|
||||
"""The engine id for KV transfers."""
|
||||
|
||||
kv_buffer_device: Optional[str] = "cuda"
|
||||
kv_buffer_device: str | None = "cuda"
|
||||
"""The device used by kv connector to buffer the KV cache. Choices are
|
||||
'cuda' and 'cpu'."""
|
||||
|
||||
@@ -35,11 +35,11 @@ class KVTransferConfig:
|
||||
"""The buffer size for TorchDistributedConnector. Measured in number of
|
||||
bytes. Recommended value: 1e9 (about 1GB)."""
|
||||
|
||||
kv_role: Optional[KVRole] = None
|
||||
kv_role: KVRole | None = None
|
||||
"""Whether this vLLM instance produces, consumes KV cache, or both. Choices
|
||||
are 'kv_producer', 'kv_consumer', and 'kv_both'."""
|
||||
|
||||
kv_rank: Optional[int] = None
|
||||
kv_rank: int | None = None
|
||||
"""The rank of this vLLM instance in the KV cache transfer. Typical value:
|
||||
0 for prefill instance, 1 for decode instance.
|
||||
Currently only 1P1D is supported."""
|
||||
@@ -57,7 +57,7 @@ class KVTransferConfig:
|
||||
kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
|
||||
"""any extra config that the connector may need."""
|
||||
|
||||
kv_connector_module_path: Optional[str] = None
|
||||
kv_connector_module_path: str | None = None
|
||||
"""The Python module path to dynamically load the KV connector from.
|
||||
Only supported in V1."""
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
@@ -25,7 +25,7 @@ logger = init_logger(__name__)
|
||||
class LoadConfig:
|
||||
"""Configuration for loading the model weights."""
|
||||
|
||||
load_format: Union[str, LoadFormats] = "auto"
|
||||
load_format: str | LoadFormats = "auto"
|
||||
"""The format of the model weights to load:\n
|
||||
- "auto" will try to load the weights in the safetensors format and fall
|
||||
back to the pytorch bin format if safetensors format is not available.\n
|
||||
@@ -48,7 +48,7 @@ class LoadConfig:
|
||||
- "mistral" will load weights from consolidated safetensors files used by
|
||||
Mistral models.
|
||||
- Other custom values can be supported via plugins."""
|
||||
download_dir: Optional[str] = None
|
||||
download_dir: str | None = None
|
||||
"""Directory to download and load the weights, default to the default
|
||||
cache directory of Hugging Face."""
|
||||
safetensors_load_strategy: str = "lazy"
|
||||
@@ -64,23 +64,19 @@ class LoadConfig:
|
||||
was quantized using torchao and saved using safetensors.
|
||||
Needs torchao >= 0.14.0
|
||||
"""
|
||||
model_loader_extra_config: Union[dict, TensorizerConfig] = Field(
|
||||
default_factory=dict
|
||||
)
|
||||
model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
|
||||
"""Extra config for model loader. This will be passed to the model loader
|
||||
corresponding to the chosen load_format."""
|
||||
device: Optional[str] = None
|
||||
device: str | None = None
|
||||
"""Device to which model weights will be loaded, default to
|
||||
device_config.device"""
|
||||
ignore_patterns: Union[list[str], str] = Field(
|
||||
default_factory=lambda: ["original/**/*"]
|
||||
)
|
||||
ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
|
||||
"""The list of patterns to ignore when loading the model. Default to
|
||||
"original/**/*" to avoid repeated loading of llama's checkpoints."""
|
||||
use_tqdm_on_load: bool = True
|
||||
"""Whether to enable tqdm for showing progress bar when loading model
|
||||
weights."""
|
||||
pt_load_map_location: Union[str, dict[str, str]] = "cpu"
|
||||
pt_load_map_location: str | dict[str, str] = "cpu"
|
||||
"""
|
||||
pt_load_map_location: the map location for loading pytorch checkpoint, to
|
||||
support loading checkpoints that can only be loaded on certain devices like
|
||||
@@ -115,8 +111,8 @@ class LoadConfig:
|
||||
|
||||
@field_validator("ignore_patterns", mode="after")
|
||||
def _validate_ignore_patterns(
|
||||
cls, ignore_patterns: Union[list[str], str]
|
||||
) -> Union[list[str], str]:
|
||||
cls, ignore_patterns: list[str] | str
|
||||
) -> list[str] | str:
|
||||
if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0:
|
||||
logger.info(
|
||||
"Ignoring the following patterns when downloading weights: %s",
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, ClassVar, Literal
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict, Field, model_validator
|
||||
@@ -42,10 +42,10 @@ class LoRAConfig:
|
||||
parallelism. Enabling this will use the fully sharded layers. At high
|
||||
sequence length, max rank or tensor parallel size, this is likely faster.
|
||||
"""
|
||||
max_cpu_loras: Optional[int] = None
|
||||
max_cpu_loras: int | None = None
|
||||
"""Maximum number of LoRAs to store in CPU memory. Must be >=
|
||||
`max_loras`."""
|
||||
lora_dtype: Union[torch.dtype, LoRADType] = "auto"
|
||||
lora_dtype: torch.dtype | LoRADType = "auto"
|
||||
"""Data type for LoRA. If auto, will default to base model dtype."""
|
||||
lora_extra_vocab_size: LoRAExtraVocabSize = Field(
|
||||
default=256,
|
||||
@@ -60,7 +60,7 @@ class LoRAConfig:
|
||||
lora_vocab_padding_size: ClassVar[int] = (
|
||||
current_platform.get_lora_vocab_padding_size()
|
||||
)
|
||||
default_mm_loras: Optional[dict[str, str]] = None
|
||||
default_mm_loras: dict[str, str] | None = None
|
||||
"""Dictionary mapping specific modalities to LoRA model paths; this field
|
||||
is only applicable to multimodal models and should be leveraged when a
|
||||
model always expects a LoRA to be active when a given modality is present.
|
||||
|
||||
@@ -4,18 +4,10 @@
|
||||
import hashlib
|
||||
import json
|
||||
import warnings
|
||||
from collections.abc import Callable
|
||||
from dataclasses import InitVar, field
|
||||
from importlib.util import find_spec
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Literal,
|
||||
Optional,
|
||||
Union,
|
||||
cast,
|
||||
get_args,
|
||||
)
|
||||
from typing import TYPE_CHECKING, Any, Literal, cast, get_args
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
|
||||
@@ -89,7 +81,7 @@ ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
||||
LogprobsMode = Literal[
|
||||
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
|
||||
]
|
||||
HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]]
|
||||
HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
|
||||
ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"]
|
||||
|
||||
_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
|
||||
@@ -121,7 +113,7 @@ class ModelConfig:
|
||||
"""Convert the model using adapters defined in
|
||||
[vllm.model_executor.models.adapters][]. The most common use case is to
|
||||
adapt a text generation model to be used for pooling tasks."""
|
||||
task: Optional[TaskOption] = None
|
||||
task: TaskOption | None = None
|
||||
"""[DEPRECATED] The task to use the model for. If the model supports more
|
||||
than one model runner, this is used to select which model runner to run.
|
||||
|
||||
@@ -139,7 +131,7 @@ class ModelConfig:
|
||||
trust_remote_code: bool = False
|
||||
"""Trust remote code (e.g., from HuggingFace) when downloading the model
|
||||
and tokenizer."""
|
||||
dtype: Union[ModelDType, torch.dtype] = "auto"
|
||||
dtype: ModelDType | torch.dtype = "auto"
|
||||
"""Data type for model weights and activations:\n
|
||||
- "auto" will use FP16 precision for FP32 and FP16 models, and BF16
|
||||
precision for BF16 models.\n
|
||||
@@ -148,33 +140,33 @@ class ModelConfig:
|
||||
- "bfloat16" for a balance between precision and range.\n
|
||||
- "float" is shorthand for FP32 precision.\n
|
||||
- "float32" for FP32 precision."""
|
||||
seed: Optional[int] = None
|
||||
seed: int | None = None
|
||||
"""Random seed for reproducibility. Initialized to None in V0, but
|
||||
initialized to 0 in V1."""
|
||||
hf_config_path: Optional[str] = None
|
||||
hf_config_path: str | None = None
|
||||
"""Name or path of the Hugging Face config to use. If unspecified, model
|
||||
name or path will be used."""
|
||||
allowed_local_media_path: str = ""
|
||||
"""Allowing API requests to read local images or videos from directories
|
||||
specified by the server file system. This is a security risk. Should only
|
||||
be enabled in trusted environments."""
|
||||
allowed_media_domains: Optional[list[str]] = None
|
||||
allowed_media_domains: list[str] | None = None
|
||||
"""If set, only media URLs that belong to one of these domains can be used for
|
||||
multi-modal inputs."""
|
||||
revision: Optional[str] = None
|
||||
revision: str | None = None
|
||||
"""The specific model version to use. It can be a branch name, a tag name,
|
||||
or a commit id. If unspecified, will use the default version."""
|
||||
code_revision: Optional[str] = None
|
||||
code_revision: str | None = None
|
||||
"""The specific revision to use for the model code on the Hugging Face Hub.
|
||||
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
||||
use the default version."""
|
||||
rope_scaling: dict[str, Any] = field(default_factory=dict)
|
||||
"""RoPE scaling configuration. For example,
|
||||
`{"rope_type":"dynamic","factor":2.0}`."""
|
||||
rope_theta: Optional[float] = None
|
||||
rope_theta: float | None = None
|
||||
"""RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE
|
||||
theta improves the performance of the scaled model."""
|
||||
tokenizer_revision: Optional[str] = None
|
||||
tokenizer_revision: str | None = None
|
||||
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
|
||||
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
||||
use the default version."""
|
||||
@@ -187,9 +179,9 @@ class ModelConfig:
|
||||
- 1k -> 1000\n
|
||||
- 1K -> 1024\n
|
||||
- 25.6k -> 25,600"""
|
||||
spec_target_max_model_len: Optional[int] = None
|
||||
spec_target_max_model_len: int | None = None
|
||||
"""Specify the maximum length for spec decoding draft models."""
|
||||
quantization: SkipValidation[Optional[QuantizationMethods]] = None
|
||||
quantization: SkipValidation[QuantizationMethods | None] = None
|
||||
"""Method used to quantize the weights. If `None`, we first check the
|
||||
`quantization_config` attribute in the model config file. If that is
|
||||
`None`, we assume the model weights are not quantized and use `dtype` to
|
||||
@@ -230,7 +222,7 @@ class ModelConfig:
|
||||
"""If `True`, enables passing text embeddings as inputs via the
|
||||
`prompt_embeds` key. Note that enabling this will double the time required
|
||||
for graph compilation."""
|
||||
served_model_name: Optional[Union[str, list[str]]] = None
|
||||
served_model_name: str | list[str] | None = None
|
||||
"""The model name(s) used in the API. If multiple names are provided, the
|
||||
server will respond to any of the provided names. The model name in the
|
||||
model field of a response will be the first name in this list. If not
|
||||
@@ -238,20 +230,20 @@ class ModelConfig:
|
||||
that this name(s) will also be used in `model_name` tag content of
|
||||
prometheus metrics, if multiple names provided, metrics tag will take the
|
||||
first one."""
|
||||
config_format: Union[str, ConfigFormat] = "auto"
|
||||
config_format: str | ConfigFormat = "auto"
|
||||
"""The format of the model config to load:\n
|
||||
- "auto" will try to load the config in hf format if available else it
|
||||
will try to load in mistral format.\n
|
||||
- "hf" will load the config in hf format.\n
|
||||
- "mistral" will load the config in mistral format."""
|
||||
hf_token: Optional[Union[bool, str]] = None
|
||||
hf_token: bool | str | None = None
|
||||
"""The token to use as HTTP bearer authorization for remote files. If
|
||||
`True`, will use the token generated when running `huggingface-cli login`
|
||||
(stored in `~/.huggingface`)."""
|
||||
hf_overrides: HfOverrides = field(default_factory=dict)
|
||||
"""If a dictionary, contains arguments to be forwarded to the Hugging Face
|
||||
config. If a callable, it is called to update the HuggingFace config."""
|
||||
logits_processor_pattern: Optional[str] = None
|
||||
logits_processor_pattern: str | None = None
|
||||
"""Optional regex pattern specifying valid logits processor qualified names
|
||||
that can be passed with the `logits_processors` extra completion argument.
|
||||
Defaults to `None`, which allows no processors."""
|
||||
@@ -269,7 +261,7 @@ class ModelConfig:
|
||||
`--generation-config vllm`, only the override parameters are used."""
|
||||
enable_sleep_mode: bool = False
|
||||
"""Enable sleep mode for the engine (only cuda platform is supported)."""
|
||||
model_impl: Union[str, ModelImpl] = "auto"
|
||||
model_impl: str | ModelImpl = "auto"
|
||||
"""Which implementation of the model to use:\n
|
||||
- "auto" will try to use the vLLM implementation, if it exists, and fall
|
||||
back to the Transformers implementation if no vLLM implementation is
|
||||
@@ -278,36 +270,36 @@ class ModelConfig:
|
||||
- "transformers" will use the Transformers model implementation.\n
|
||||
- "terratorch" will use the TerraTorch model implementation.
|
||||
"""
|
||||
override_attention_dtype: Optional[str] = None
|
||||
override_attention_dtype: str | None = None
|
||||
"""Override dtype for attention"""
|
||||
logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None
|
||||
logits_processors: list[str | type[LogitsProcessor]] | None = None
|
||||
"""One or more logits processors' fully-qualified class names or class
|
||||
definitions"""
|
||||
io_processor_plugin: Optional[str] = None
|
||||
io_processor_plugin: str | None = None
|
||||
"""IOProcessor plugin name to load at model startup"""
|
||||
|
||||
# Pooler config
|
||||
pooler_config: Optional[PoolerConfig] = None
|
||||
pooler_config: PoolerConfig | None = None
|
||||
"""Pooler config which controls the behaviour of output pooling in pooling
|
||||
models."""
|
||||
override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
|
||||
override_pooler_config: dict | PoolerConfig | None = None
|
||||
"""[DEPRECATED] Use `pooler_config` instead. This field will be removed in
|
||||
v0.12.0 or v1.0.0, whichever is sooner."""
|
||||
|
||||
# Multimodal config and init vars
|
||||
multimodal_config: Optional[MultiModalConfig] = None
|
||||
multimodal_config: MultiModalConfig | None = None
|
||||
"""Configuration for multimodal model. If `None`, this will be inferred
|
||||
from the architecture of `self.model`."""
|
||||
limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int, dict[str, int]]]]] = None
|
||||
media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
|
||||
mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
|
||||
mm_processor_cache_gb: InitVar[Optional[float]] = None
|
||||
mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
|
||||
mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
|
||||
mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
|
||||
interleave_mm_strings: InitVar[Optional[bool]] = None
|
||||
skip_mm_profiling: InitVar[Optional[bool]] = None
|
||||
video_pruning_rate: InitVar[Optional[float]] = None
|
||||
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
|
||||
media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
|
||||
mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
|
||||
mm_processor_cache_gb: InitVar[float | None] = None
|
||||
mm_processor_cache_type: InitVar[MMCacheType | None] = None
|
||||
mm_shm_cache_max_object_size_mb: InitVar[int | None] = None
|
||||
mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None
|
||||
interleave_mm_strings: InitVar[bool | None] = None
|
||||
skip_mm_profiling: InitVar[bool | None] = None
|
||||
video_pruning_rate: InitVar[float | None] = None
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
"""
|
||||
@@ -369,7 +361,7 @@ class ModelConfig:
|
||||
|
||||
def _update_nested(
|
||||
self,
|
||||
target: Union["PretrainedConfig", dict[str, Any]],
|
||||
target: PretrainedConfig | dict[str, Any],
|
||||
updates: dict[str, Any],
|
||||
) -> None:
|
||||
"""Recursively updates a config or dict with nested updates."""
|
||||
@@ -397,7 +389,7 @@ class ModelConfig:
|
||||
|
||||
def _apply_dict_overrides(
|
||||
self,
|
||||
config: "PretrainedConfig",
|
||||
config: PretrainedConfig,
|
||||
overrides: dict[str, Any],
|
||||
) -> None:
|
||||
"""Apply dict overrides, handling both nested configs and dict values."""
|
||||
@@ -415,16 +407,16 @@ class ModelConfig:
|
||||
def __post_init__(
|
||||
self,
|
||||
# Multimodal config init vars
|
||||
limit_mm_per_prompt: Optional[dict[str, int]],
|
||||
media_io_kwargs: Optional[dict[str, dict[str, Any]]],
|
||||
mm_processor_kwargs: Optional[dict[str, Any]],
|
||||
mm_processor_cache_gb: Optional[float],
|
||||
mm_processor_cache_type: Optional[MMCacheType],
|
||||
mm_shm_cache_max_object_size_mb: Optional[int],
|
||||
mm_encoder_tp_mode: Optional[MMEncoderTPMode],
|
||||
interleave_mm_strings: Optional[bool],
|
||||
skip_mm_profiling: Optional[bool],
|
||||
video_pruning_rate: Optional[float],
|
||||
limit_mm_per_prompt: dict[str, int] | None,
|
||||
media_io_kwargs: dict[str, dict[str, Any]] | None,
|
||||
mm_processor_kwargs: dict[str, Any] | None,
|
||||
mm_processor_cache_gb: float | None,
|
||||
mm_processor_cache_type: MMCacheType | None,
|
||||
mm_shm_cache_max_object_size_mb: int | None,
|
||||
mm_encoder_tp_mode: MMEncoderTPMode | None,
|
||||
interleave_mm_strings: bool | None,
|
||||
skip_mm_profiling: bool | None,
|
||||
video_pruning_rate: float | None,
|
||||
) -> None:
|
||||
# Set the default seed to 0 in V1.
|
||||
# NOTE(woosuk): In V0, we set the default seed to None because the
|
||||
@@ -1209,7 +1201,7 @@ class ModelConfig:
|
||||
"Supported models implement the `SupportsPP` interface."
|
||||
)
|
||||
|
||||
def get_sliding_window(self) -> Optional[int]:
|
||||
def get_sliding_window(self) -> int | None:
|
||||
"""Get the sliding window size from the HF text config if present."""
|
||||
return getattr(self.hf_text_config, "sliding_window", None)
|
||||
|
||||
@@ -1479,7 +1471,7 @@ class ModelConfig:
|
||||
f"{block_type.value} layers"
|
||||
)
|
||||
|
||||
def get_mamba_chunk_size(self) -> Optional[int]:
|
||||
def get_mamba_chunk_size(self) -> int | None:
|
||||
"""
|
||||
Returns the mamba chunk size if it exists
|
||||
"""
|
||||
@@ -1715,9 +1707,7 @@ class ModelConfig:
|
||||
return max_model_len
|
||||
|
||||
|
||||
def get_served_model_name(
|
||||
model: str, served_model_name: Optional[Union[str, list[str]]]
|
||||
):
|
||||
def get_served_model_name(model: str, served_model_name: str | list[str] | None):
|
||||
"""
|
||||
If the input is a non-empty list, the first model_name in
|
||||
`served_model_name` is taken.
|
||||
@@ -1761,9 +1751,9 @@ def iter_architecture_defaults():
|
||||
def try_match_architecture_defaults(
|
||||
architecture: str,
|
||||
*,
|
||||
runner_type: Optional[RunnerType] = None,
|
||||
convert_type: Optional[ConvertType] = None,
|
||||
) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]:
|
||||
runner_type: RunnerType | None = None,
|
||||
convert_type: ConvertType | None = None,
|
||||
) -> tuple[str, tuple[RunnerType, ConvertType]] | None:
|
||||
for suffix, (
|
||||
default_runner_type,
|
||||
default_convert_type,
|
||||
@@ -1817,7 +1807,7 @@ def _find_dtype(
|
||||
model_id: str,
|
||||
config: PretrainedConfig,
|
||||
*,
|
||||
revision: Optional[str],
|
||||
revision: str | None,
|
||||
):
|
||||
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
|
||||
# because config.torch_dtype can be None.
|
||||
@@ -1902,10 +1892,10 @@ def _resolve_auto_dtype(
|
||||
def _get_and_verify_dtype(
|
||||
model_id: str,
|
||||
config: PretrainedConfig,
|
||||
dtype: Union[str, torch.dtype],
|
||||
dtype: str | torch.dtype,
|
||||
*,
|
||||
is_pooling_model: bool,
|
||||
revision: Optional[str] = None,
|
||||
revision: str | None = None,
|
||||
) -> torch.dtype:
|
||||
config_dtype = _find_dtype(model_id, config, revision=revision)
|
||||
model_type = config.model_type
|
||||
@@ -1947,7 +1937,7 @@ def _get_and_verify_dtype(
|
||||
def _get_head_dtype(
|
||||
config: PretrainedConfig, dtype: torch.dtype, runner_type: str
|
||||
) -> torch.dtype:
|
||||
head_dtype: Optional[Union[str, torch.dtype]] = getattr(config, "head_dtype", None)
|
||||
head_dtype: str | torch.dtype | None = getattr(config, "head_dtype", None)
|
||||
|
||||
if head_dtype == "model":
|
||||
return dtype
|
||||
@@ -1970,12 +1960,12 @@ def _get_head_dtype(
|
||||
|
||||
def _get_and_verify_max_len(
|
||||
hf_config: PretrainedConfig,
|
||||
tokenizer_config: Optional[dict],
|
||||
max_model_len: Optional[int],
|
||||
tokenizer_config: dict | None,
|
||||
max_model_len: int | None,
|
||||
disable_sliding_window: bool,
|
||||
sliding_window: Optional[int],
|
||||
spec_target_max_model_len: Optional[int] = None,
|
||||
encoder_config: Optional[Any] = None,
|
||||
sliding_window: int | None,
|
||||
spec_target_max_model_len: int | None = None,
|
||||
encoder_config: Any | None = None,
|
||||
) -> int:
|
||||
"""Get and verify the model's maximum length."""
|
||||
derived_max_model_len = float("inf")
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
import hashlib
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal, Optional, Union
|
||||
from typing import Any, Literal, TypeAlias
|
||||
|
||||
from pydantic import ConfigDict, Field, field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
@@ -23,31 +23,31 @@ class BaseDummyOptions:
|
||||
class VideoDummyOptions(BaseDummyOptions):
|
||||
"""Options for generating dummy video data during profiling."""
|
||||
|
||||
num_frames: Optional[int] = Field(None, gt=0)
|
||||
width: Optional[int] = Field(None, gt=0)
|
||||
height: Optional[int] = Field(None, gt=0)
|
||||
num_frames: int | None = Field(None, gt=0)
|
||||
width: int | None = Field(None, gt=0)
|
||||
height: int | None = Field(None, gt=0)
|
||||
|
||||
|
||||
@dataclass(config=ConfigDict(extra="forbid"))
|
||||
class ImageDummyOptions(BaseDummyOptions):
|
||||
"""Options for generating dummy image data during profiling."""
|
||||
|
||||
width: Optional[int] = Field(None, gt=0)
|
||||
height: Optional[int] = Field(None, gt=0)
|
||||
width: int | None = Field(None, gt=0)
|
||||
height: int | None = Field(None, gt=0)
|
||||
|
||||
|
||||
@dataclass(config=ConfigDict(extra="forbid"))
|
||||
class AudioDummyOptions(BaseDummyOptions):
|
||||
"""Options for generating dummy audio data during profiling."""
|
||||
|
||||
length: Optional[int] = Field(None, gt=0)
|
||||
length: int | None = Field(None, gt=0)
|
||||
|
||||
|
||||
MMEncoderTPMode = Literal["weights", "data"]
|
||||
MMCacheType = Literal["shm", "lru"]
|
||||
DummyOptions = Union[
|
||||
BaseDummyOptions, VideoDummyOptions, ImageDummyOptions, AudioDummyOptions
|
||||
]
|
||||
DummyOptions: TypeAlias = (
|
||||
BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions
|
||||
)
|
||||
|
||||
|
||||
@config
|
||||
@@ -75,7 +75,7 @@ class MultiModalConfig:
|
||||
"""Additional args passed to process media inputs, keyed by modalities.
|
||||
For example, to set num_frames for video, set
|
||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
|
||||
mm_processor_kwargs: Optional[dict[str, object]] = None
|
||||
mm_processor_kwargs: dict[str, object] | None = None
|
||||
"""Arguments to be forwarded to the model's processor for multi-modal data,
|
||||
e.g., image processor. Overrides for the multi-modal processor obtained
|
||||
from `transformers.AutoProcessor.from_pretrained`.
|
||||
@@ -123,7 +123,7 @@ class MultiModalConfig:
|
||||
This reduces engine startup time but shifts the responsibility to users for
|
||||
estimating the peak memory usage of the activation of multimodal encoder and
|
||||
embedding cache."""
|
||||
video_pruning_rate: Optional[float] = None
|
||||
video_pruning_rate: float | None = None
|
||||
"""Sets pruning rate for video pruning via Efficient Video Sampling.
|
||||
Value sits in range [0;1) and determines fraction of media tokens
|
||||
from each video to be pruned.
|
||||
@@ -132,7 +132,7 @@ class MultiModalConfig:
|
||||
@field_validator("limit_per_prompt", mode="before")
|
||||
@classmethod
|
||||
def _validate_limit_per_prompt(
|
||||
cls, value: dict[str, Union[int, dict[str, int]]]
|
||||
cls, value: dict[str, int | dict[str, int]]
|
||||
) -> dict[str, DummyOptions]:
|
||||
for k, v in value.items():
|
||||
# Handle legacy format where only count is specified
|
||||
@@ -179,7 +179,7 @@ class MultiModalConfig:
|
||||
return 999
|
||||
return limit_data.count
|
||||
|
||||
def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]:
|
||||
def get_dummy_options(self, modality: str) -> BaseDummyOptions | None:
|
||||
"""
|
||||
Get the configurable dummy data options for a modality.
|
||||
Returns None if no options are configured for this modality.
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import hashlib
|
||||
from functools import cached_property
|
||||
from typing import Any, Literal, Optional, cast
|
||||
from typing import Any, Literal, cast
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
@@ -18,7 +18,7 @@ DetailedTraceModules = Literal["model", "worker", "all"]
|
||||
class ObservabilityConfig:
|
||||
"""Configuration for observability - metrics and tracing."""
|
||||
|
||||
show_hidden_metrics_for_version: Optional[str] = None
|
||||
show_hidden_metrics_for_version: str | None = None
|
||||
"""Enable deprecated Prometheus metrics that have been hidden since the
|
||||
specified version. For example, if a previously deprecated metric has been
|
||||
hidden since the v0.7.0 release, you use
|
||||
@@ -33,10 +33,10 @@ class ObservabilityConfig:
|
||||
return False
|
||||
return version._prev_minor_version_was(self.show_hidden_metrics_for_version)
|
||||
|
||||
otlp_traces_endpoint: Optional[str] = None
|
||||
otlp_traces_endpoint: str | None = None
|
||||
"""Target URL to which OpenTelemetry traces will be sent."""
|
||||
|
||||
collect_detailed_traces: Optional[list[DetailedTraceModules]] = None
|
||||
collect_detailed_traces: list[DetailedTraceModules] | None = None
|
||||
"""It makes sense to set this only if `--otlp-traces-endpoint` is set. If
|
||||
set, it will collect detailed traces for the specified modules. This
|
||||
involves use of possibly costly and or blocking operations and hence might
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
import torch
|
||||
from pydantic import Field, model_validator
|
||||
@@ -75,7 +75,7 @@ class ParallelConfig:
|
||||
"""Number of local data parallel groups."""
|
||||
data_parallel_rank: int = 0
|
||||
"""Rank of the data parallel group."""
|
||||
data_parallel_rank_local: Optional[int] = None
|
||||
data_parallel_rank_local: int | None = None
|
||||
"""Local rank of the data parallel group,
|
||||
set only in SPMD mode."""
|
||||
data_parallel_master_ip: str = "127.0.0.1"
|
||||
@@ -113,24 +113,24 @@ class ParallelConfig:
|
||||
with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
|
||||
will have experts [1, 3]. This strategy can help improve load balancing
|
||||
for grouped expert models with no redundant experts."""
|
||||
num_redundant_experts: Optional[int] = None
|
||||
num_redundant_experts: int | None = None
|
||||
"""`num_redundant_experts` is deprecated and has been replaced with
|
||||
`eplb_config.num_redundant_experts`. This will be removed in v0.12.0.
|
||||
Please use `eplb_config.num_redundant_experts` instead."""
|
||||
eplb_window_size: Optional[int] = None
|
||||
eplb_window_size: int | None = None
|
||||
"""`eplb_window_size` is deprecated and has been replaced with
|
||||
`eplb_config.window_size`. This will be removed in v0.12.0.
|
||||
Please use `eplb_config.window_size` instead."""
|
||||
eplb_step_interval: Optional[int] = None
|
||||
eplb_step_interval: int | None = None
|
||||
"""`eplb_step_interval` is deprecated and has been replaced with
|
||||
`eplb_config.step_interval`. This will be removed in v0.12.0.
|
||||
Please use `eplb_config.step_interval` instead."""
|
||||
eplb_log_balancedness: Optional[bool] = None
|
||||
eplb_log_balancedness: bool | None = None
|
||||
"""`eplb_log_balancedness` is deprecated and has been replaced with
|
||||
`eplb_config.log_balancedness`. This will be removed in v0.12.0.
|
||||
Please use `eplb_config.log_balancedness` instead."""
|
||||
|
||||
max_parallel_loading_workers: Optional[int] = None
|
||||
max_parallel_loading_workers: int | None = None
|
||||
"""Maximum number of parallel loading workers when loading model
|
||||
sequentially in multiple batches. To avoid RAM OOM when using tensor
|
||||
parallel and large models."""
|
||||
@@ -159,15 +159,15 @@ class ParallelConfig:
|
||||
ray_workers_use_nsight: bool = False
|
||||
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
|
||||
|
||||
ray_runtime_env: Optional[RuntimeEnv] = None
|
||||
ray_runtime_env: RuntimeEnv | None = None
|
||||
"""Ray runtime environment to pass to distributed workers."""
|
||||
|
||||
placement_group: Optional[PlacementGroup] = None
|
||||
placement_group: PlacementGroup | None = None
|
||||
"""ray distributed model workers placement group."""
|
||||
|
||||
distributed_executor_backend: Optional[
|
||||
Union[str, DistributedExecutorBackend, type[ExecutorBase]]
|
||||
] = None
|
||||
distributed_executor_backend: (
|
||||
str | DistributedExecutorBackend | type[ExecutorBase] | None
|
||||
) = None
|
||||
"""Backend to use for distributed model
|
||||
workers, either "ray" or "mp" (multiprocessing). If the product
|
||||
of pipeline_parallel_size and tensor_parallel_size is less than
|
||||
@@ -306,7 +306,7 @@ class ParallelConfig:
|
||||
)
|
||||
|
||||
max_retries = 5
|
||||
last_exc: Optional[Exception] = None
|
||||
last_exc: Exception | None = None
|
||||
for _ in range(max_retries):
|
||||
try:
|
||||
# use gloo since the engine process might not have cuda device
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
@@ -14,23 +14,23 @@ from vllm.config.utils import config
|
||||
class PoolerConfig:
|
||||
"""Controls the behavior of output pooling in pooling models."""
|
||||
|
||||
pooling_type: Optional[str] = None
|
||||
pooling_type: str | None = None
|
||||
"""
|
||||
The pooling method of the pooling model. This should be a key in
|
||||
[`vllm.model_executor.layers.pooler.PoolingType`][].
|
||||
"""
|
||||
|
||||
## for embeddings models
|
||||
normalize: Optional[bool] = None
|
||||
normalize: bool | None = None
|
||||
"""
|
||||
Whether to normalize the embeddings outputs. Defaults to True.
|
||||
"""
|
||||
dimensions: Optional[int] = None
|
||||
dimensions: int | None = None
|
||||
"""
|
||||
Reduce the dimensions of embeddings if model
|
||||
support matryoshka representation. Defaults to None.
|
||||
"""
|
||||
enable_chunked_processing: Optional[bool] = None
|
||||
enable_chunked_processing: bool | None = None
|
||||
"""
|
||||
Whether to enable chunked processing for long inputs that exceed the model's
|
||||
maximum position embeddings. When enabled, long inputs will be split into
|
||||
@@ -38,7 +38,7 @@ class PoolerConfig:
|
||||
This allows embedding models to handle arbitrarily long text without CUDA
|
||||
errors. Defaults to False.
|
||||
"""
|
||||
max_embed_len: Optional[int] = None
|
||||
max_embed_len: int | None = None
|
||||
"""
|
||||
Maximum input length allowed for embedding generation. When set, allows
|
||||
inputs longer than max_embed_len to be accepted for embedding models.
|
||||
@@ -48,29 +48,29 @@ class PoolerConfig:
|
||||
"""
|
||||
|
||||
## for classification models
|
||||
activation: Optional[bool] = None
|
||||
activation: bool | None = None
|
||||
"""
|
||||
Whether to apply activation function to the classification outputs.
|
||||
Defaults to True.
|
||||
"""
|
||||
logit_bias: Optional[float] = None
|
||||
logit_bias: float | None = None
|
||||
"""
|
||||
If provided, apply classification logit biases. Defaults to None.
|
||||
"""
|
||||
|
||||
## for reward models
|
||||
softmax: Optional[bool] = None
|
||||
softmax: bool | None = None
|
||||
"""
|
||||
Whether to apply softmax to the reward outputs.
|
||||
Defaults to True.
|
||||
"""
|
||||
step_tag_id: Optional[int] = None
|
||||
step_tag_id: int | None = None
|
||||
"""
|
||||
If set, only the score corresponding to the ``step_tag_id`` in the
|
||||
generated sentence should be returned. Otherwise, the scores for all tokens
|
||||
are returned.
|
||||
"""
|
||||
returned_token_ids: Optional[list[int]] = None
|
||||
returned_token_ids: list[int] | None = None
|
||||
"""
|
||||
A list of indices for the vocabulary dimensions to be extracted,
|
||||
such as the token IDs of ``good_token`` and ``bad_token`` in the
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import hashlib
|
||||
from dataclasses import InitVar, field
|
||||
from typing import Any, Literal, Union
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import SkipValidation, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
@@ -133,7 +133,7 @@ class SchedulerConfig:
|
||||
|
||||
# scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
|
||||
# or "mod.custom_class".
|
||||
scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
|
||||
scheduler_cls: str | type[object] = "vllm.core.scheduler.Scheduler"
|
||||
"""The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
|
||||
default scheduler. Can be a class directly or the path to a class of form
|
||||
"mod.custom_class"."""
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import ast
|
||||
import hashlib
|
||||
from typing import TYPE_CHECKING, Any, Literal, Optional
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
from pydantic import SkipValidation, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
@@ -59,16 +59,16 @@ MTP_MODEL_TYPES = (
|
||||
class SpeculativeConfig:
|
||||
"""Configuration for speculative decoding."""
|
||||
|
||||
enforce_eager: Optional[bool] = None
|
||||
enforce_eager: bool | None = None
|
||||
"""Override the default enforce_eager from model_config"""
|
||||
# General speculative decoding control
|
||||
num_speculative_tokens: SkipValidation[int] = None # type: ignore
|
||||
"""The number of speculative tokens, if provided. It will default to the
|
||||
number in the draft model config if present, otherwise, it is required."""
|
||||
model: Optional[str] = None
|
||||
model: str | None = None
|
||||
"""The name of the draft model, eagle head, or additional weights, if
|
||||
provided."""
|
||||
method: Optional[SpeculativeMethod] = None
|
||||
method: SpeculativeMethod | None = None
|
||||
"""The name of the speculative method to use. If users provide and set the
|
||||
`model` param, the speculative method type will be detected automatically
|
||||
if possible, if `model` param is not provided, the method name must be
|
||||
@@ -76,7 +76,7 @@ class SpeculativeConfig:
|
||||
|
||||
If using `ngram` method, the related configuration `prompt_lookup_max` and
|
||||
`prompt_lookup_min` should be considered."""
|
||||
draft_tensor_parallel_size: Optional[int] = None
|
||||
draft_tensor_parallel_size: int | None = None
|
||||
"""The degree of the tensor parallelism for the draft model. Can only be 1
|
||||
or the same as the target model's tensor parallel size."""
|
||||
disable_logprobs: bool = True
|
||||
@@ -85,24 +85,24 @@ class SpeculativeConfig:
|
||||
according to the log probability settings in SamplingParams."""
|
||||
|
||||
# Draft model configuration
|
||||
quantization: Optional[me_quant.QuantizationMethods] = None
|
||||
quantization: me_quant.QuantizationMethods | None = None
|
||||
"""Quantization method that was used to quantize the draft model weights.
|
||||
If `None`, we assume the model weights are not quantized. Note that it only
|
||||
takes effect when using the draft model-based speculative method."""
|
||||
max_model_len: Optional[int] = None
|
||||
max_model_len: int | None = None
|
||||
"""The maximum model length of the draft model. Used when testing the
|
||||
ability to skip speculation for some sequences."""
|
||||
revision: Optional[str] = None
|
||||
revision: str | None = None
|
||||
"""The specific model version to use for the draft model. It can be a
|
||||
branch name, a tag name, or a commit id. If unspecified, will use the
|
||||
default version."""
|
||||
code_revision: Optional[str] = None
|
||||
code_revision: str | None = None
|
||||
"""The specific revision to use for the draft model code on Hugging Face
|
||||
Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
|
||||
will use the default version."""
|
||||
|
||||
# Advanced control
|
||||
disable_by_batch_size: Optional[int] = None
|
||||
disable_by_batch_size: int | None = None
|
||||
"""Disable speculative decoding for new incoming requests when the number
|
||||
of enqueued requests is larger than this value, if provided."""
|
||||
disable_padded_drafter_batch: bool = False
|
||||
@@ -112,14 +112,14 @@ class SpeculativeConfig:
|
||||
only affects the EAGLE method of speculation."""
|
||||
|
||||
# Ngram proposer configuration
|
||||
prompt_lookup_max: Optional[int] = None
|
||||
prompt_lookup_max: int | None = None
|
||||
"""Maximum size of ngram token window when using Ngram proposer, required
|
||||
when method is set to ngram."""
|
||||
prompt_lookup_min: Optional[int] = None
|
||||
prompt_lookup_min: int | None = None
|
||||
"""Minimum size of ngram token window when using Ngram proposer, if
|
||||
provided. Defaults to 1."""
|
||||
|
||||
speculative_token_tree: Optional[str] = None
|
||||
speculative_token_tree: str | None = None
|
||||
"""Specifies the tree structure for speculative token generation.
|
||||
"""
|
||||
# required configuration params passed from engine
|
||||
@@ -449,7 +449,7 @@ class SpeculativeConfig:
|
||||
|
||||
@staticmethod
|
||||
def _maybe_override_draft_max_model_len(
|
||||
speculative_max_model_len: Optional[int],
|
||||
speculative_max_model_len: int | None,
|
||||
draft_max_model_len: int,
|
||||
target_max_model_len: int,
|
||||
) -> int:
|
||||
@@ -488,7 +488,7 @@ class SpeculativeConfig:
|
||||
@staticmethod
|
||||
def _verify_and_get_draft_tp(
|
||||
target_parallel_config: ParallelConfig,
|
||||
speculative_draft_tensor_parallel_size: Optional[int],
|
||||
speculative_draft_tensor_parallel_size: int | None,
|
||||
draft_hf_config: PretrainedConfig,
|
||||
) -> int:
|
||||
"""
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
@@ -28,7 +27,7 @@ class SpeechToTextConfig:
|
||||
splitting long audio. This helps maintain context across chunk boundaries
|
||||
and improves transcription quality at split points."""
|
||||
|
||||
min_energy_split_window_size: Optional[int] = 1600
|
||||
min_energy_split_window_size: int | None = 1600
|
||||
"""Window size in samples for finding low-energy (quiet) regions to split
|
||||
audio chunks. The algorithm looks for the quietest moment within this
|
||||
window to minimize cutting through speech. Default 1600 samples ≈ 100ms
|
||||
|
||||
@@ -10,7 +10,7 @@ from contextlib import contextmanager
|
||||
from dataclasses import field, replace
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
|
||||
from typing import TYPE_CHECKING, Any, TypeVar
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict
|
||||
@@ -69,17 +69,17 @@ class VllmConfig:
|
||||
"""Device configuration."""
|
||||
load_config: LoadConfig = field(default_factory=LoadConfig)
|
||||
"""Load configuration."""
|
||||
lora_config: Optional[LoRAConfig] = None
|
||||
lora_config: LoRAConfig | None = None
|
||||
"""LoRA configuration."""
|
||||
speculative_config: Optional[SpeculativeConfig] = None
|
||||
speculative_config: SpeculativeConfig | None = None
|
||||
"""Speculative decoding configuration."""
|
||||
structured_outputs_config: StructuredOutputsConfig = field(
|
||||
default_factory=StructuredOutputsConfig
|
||||
)
|
||||
"""Structured outputs configuration."""
|
||||
observability_config: Optional[ObservabilityConfig] = None
|
||||
observability_config: ObservabilityConfig | None = None
|
||||
"""Observability configuration."""
|
||||
quant_config: Optional[QuantizationConfig] = None
|
||||
quant_config: QuantizationConfig | None = None
|
||||
"""Quantization configuration."""
|
||||
compilation_config: CompilationConfig = field(default_factory=CompilationConfig)
|
||||
"""`torch.compile` and cudagraph capture configuration for the model.
|
||||
@@ -96,14 +96,14 @@ class VllmConfig:
|
||||
You can specify the full compilation config like so:
|
||||
`{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
|
||||
"""
|
||||
kv_transfer_config: Optional[KVTransferConfig] = None
|
||||
kv_transfer_config: KVTransferConfig | None = None
|
||||
"""The configurations for distributed KV cache transfer."""
|
||||
kv_events_config: Optional[KVEventsConfig] = None
|
||||
kv_events_config: KVEventsConfig | None = None
|
||||
"""The configurations for event publishing."""
|
||||
# some opaque config, only used to provide additional information
|
||||
# for the hash computation, mainly used for testing, debugging or out of
|
||||
# tree config registration.
|
||||
additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
|
||||
additional_config: dict | SupportsHash = field(default_factory=dict)
|
||||
"""Additional config for specified platform. Different platforms may
|
||||
support different configs. Make sure the configs are valid for the platform
|
||||
you are using. Contents must be hashable."""
|
||||
@@ -212,7 +212,7 @@ class VllmConfig:
|
||||
@staticmethod
|
||||
def _get_quantization_config(
|
||||
model_config: ModelConfig, load_config: LoadConfig
|
||||
) -> Optional[QuantizationConfig]:
|
||||
) -> QuantizationConfig | None:
|
||||
"""Get the quantization config."""
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@@ -245,7 +245,7 @@ class VllmConfig:
|
||||
@staticmethod
|
||||
def get_quantization_config(
|
||||
model_config: ModelConfig, load_config: LoadConfig
|
||||
) -> Optional[QuantizationConfig]:
|
||||
) -> QuantizationConfig | None:
|
||||
import copy
|
||||
|
||||
# For some reason, the _ version of this modifies the model_config
|
||||
@@ -257,7 +257,7 @@ class VllmConfig:
|
||||
def with_hf_config(
|
||||
self,
|
||||
hf_config: PretrainedConfig,
|
||||
architectures: Optional[list[str]] = None,
|
||||
architectures: list[str] | None = None,
|
||||
) -> "VllmConfig":
|
||||
if architectures is not None:
|
||||
hf_config = copy.deepcopy(hf_config)
|
||||
@@ -740,7 +740,7 @@ class VllmConfig:
|
||||
f"Model: {self.model_config.model}"
|
||||
)
|
||||
|
||||
def compile_debug_dump_path(self) -> Optional[Path]:
|
||||
def compile_debug_dump_path(self) -> Path | None:
|
||||
"""Returns a rank-aware path for dumping
|
||||
torch.compile debug information.
|
||||
"""
|
||||
@@ -790,13 +790,13 @@ class VllmConfig:
|
||||
)
|
||||
|
||||
|
||||
_current_vllm_config: Optional[VllmConfig] = None
|
||||
_current_prefix: Optional[str] = None
|
||||
_current_vllm_config: VllmConfig | None = None
|
||||
_current_prefix: str | None = None
|
||||
|
||||
|
||||
@contextmanager
|
||||
def set_current_vllm_config(
|
||||
vllm_config: VllmConfig, check_compile=False, prefix: Optional[str] = None
|
||||
vllm_config: VllmConfig, check_compile=False, prefix: str | None = None
|
||||
):
|
||||
"""
|
||||
Temporarily set the current vLLM config.
|
||||
@@ -866,7 +866,7 @@ T = TypeVar("T")
|
||||
def get_layers_from_vllm_config(
|
||||
vllm_config: VllmConfig,
|
||||
layer_type: type[T],
|
||||
layer_names: Optional[list[str]] = None,
|
||||
layer_names: list[str] | None = None,
|
||||
) -> dict[str, T]:
|
||||
"""
|
||||
Get layers from the vLLM config.
|
||||
|
||||
Reference in New Issue
Block a user