Move VllmConfig from config/__init__.py to config/vllm.py (#25271)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor
2025-09-30 03:49:49 +01:00
committed by GitHub
parent d3bd171123
commit 61aedb5ffe
36 changed files with 964 additions and 905 deletions


@@ -1,29 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: F401
import ast
import copy
import hashlib
import inspect
import json
import os
import textwrap
from contextlib import contextmanager
from dataclasses import field, fields, is_dataclass, replace
from functools import cached_property, lru_cache
from pathlib import Path
from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar,
Union, cast)
import regex as re
import torch
from pydantic import ConfigDict, SkipValidation
from pydantic.dataclasses import dataclass
from typing_extensions import runtime_checkable
import vllm.envs as envs
from vllm import version
from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType,
PrefixCachingHashAlgo)
from vllm.config.compilation import (CompilationConfig, CompilationLevel,
@@ -48,806 +25,82 @@ from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy
from vllm.config.speculative import SpeculativeConfig
from vllm.config.speech_to_text import SpeechToTextConfig
from vllm.config.structured_outputs import StructuredOutputsConfig
from vllm.config.utils import ConfigType, config, get_attr_docs, is_init_field
from vllm.logger import init_logger
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid
if TYPE_CHECKING:
from _typeshed import DataclassInstance
from transformers.configuration_utils import PretrainedConfig
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
else:
DataclassInstance = Any
PretrainedConfig = Any
QuantizationConfig = Any
QuantizationMethods = Any
BaseModelLoader = Any
LogitsProcessor = Any
logger = init_logger(__name__)
DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance)
@runtime_checkable
class SupportsHash(Protocol):
def compute_hash(self) -> str:
...
class SupportsMetricsInfo(Protocol):
def metrics_info(self) -> dict[str, str]:
...
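# Illustrative sketch (not part of this diff): an out-of-tree object can satisfy
# SupportsHash and be passed as `VllmConfig.additional_config`; compute_hash()
# further below then folds its hash into the overall config hash. The class and
# its field are hypothetical.
class _ExamplePlatformConfig:
    def __init__(self, use_custom_kernels: bool = True) -> None:
        self.use_custom_kernels = use_custom_kernels

    def compute_hash(self) -> str:
        # Only include fields that change the compiled computation graph.
        return hashlib.md5(str(self.use_custom_kernels).encode(),
                           usedforsecurity=False).hexdigest()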
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class VllmConfig:
"""Dataclass which contains all vllm-related configuration. This
simplifies passing around the distinct configurations in the codebase.
"""
# TODO: use default_factory once default constructing ModelConfig doesn't
# try to download a model
model_config: ModelConfig = None # type: ignore
"""Model configuration."""
cache_config: CacheConfig = field(default_factory=CacheConfig)
"""Cache configuration."""
parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
"""Parallel configuration."""
scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
"""Scheduler configuration."""
device_config: DeviceConfig = field(default_factory=DeviceConfig)
"""Device configuration."""
load_config: LoadConfig = field(default_factory=LoadConfig)
"""Load configuration."""
lora_config: Optional[LoRAConfig] = None
"""LoRA configuration."""
speculative_config: Optional[SpeculativeConfig] = None
"""Speculative decoding configuration."""
structured_outputs_config: StructuredOutputsConfig = field(
default_factory=StructuredOutputsConfig)
"""Structured outputs configuration."""
observability_config: Optional[ObservabilityConfig] = None
"""Observability configuration."""
quant_config: Optional[QuantizationConfig] = None
"""Quantization configuration."""
compilation_config: CompilationConfig = field(
default_factory=CompilationConfig)
"""`torch.compile` and cudagraph capture configuration for the model.
As a shorthand, `-O<n>` can be used to directly specify the compilation
level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
Currently, -O <n> and -O=<n> are supported as well but this will likely be
removed in favor of clearer -O<n> syntax in the future.
NOTE: level 0 is the default level without any optimization. Levels 1 and 2
are for internal testing only. Level 3 is the recommended level for
production and is the default in V1.
You can specify the full compilation config like so:
`{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
"""
kv_transfer_config: Optional[KVTransferConfig] = None
"""The configurations for distributed KV cache transfer."""
kv_events_config: Optional[KVEventsConfig] = None
"""The configurations for event publishing."""
# some opaque config, only used to provide additional information
# for the hash computation, mainly used for testing, debugging or out of
# tree config registration.
additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
"""Additional config for specified platform. Different platforms may
support different configs. Make sure the configs are valid for the platform
you are using. Contents must be hashable."""
instance_id: str = ""
"""The ID of the vLLM instance."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = []
# summarize vllm config
vllm_factors: list[Any] = []
from vllm import __version__
vllm_factors.append(__version__)
vllm_factors.append(envs.VLLM_USE_V1)
if self.model_config:
vllm_factors.append(self.model_config.compute_hash())
else:
vllm_factors.append("None")
if self.cache_config:
vllm_factors.append(self.cache_config.compute_hash())
else:
vllm_factors.append("None")
if self.parallel_config:
vllm_factors.append(self.parallel_config.compute_hash())
else:
vllm_factors.append("None")
if self.scheduler_config:
vllm_factors.append(self.scheduler_config.compute_hash())
else:
vllm_factors.append("None")
if self.device_config:
vllm_factors.append(self.device_config.compute_hash())
else:
vllm_factors.append("None")
if self.load_config:
vllm_factors.append(self.load_config.compute_hash())
else:
vllm_factors.append("None")
if self.lora_config:
vllm_factors.append(self.lora_config.compute_hash())
# LoRA creates static buffers based on max_num_batched_tokens.
# The tensor sizes and strides get captured in the torch.compile
# graph explicitly.
vllm_factors.append(
str(self.scheduler_config.max_num_batched_tokens))
else:
vllm_factors.append("None")
if self.speculative_config:
vllm_factors.append(self.speculative_config.compute_hash())
else:
vllm_factors.append("None")
if self.structured_outputs_config:
vllm_factors.append(self.structured_outputs_config.compute_hash())
else:
vllm_factors.append("None")
if self.observability_config:
vllm_factors.append(self.observability_config.compute_hash())
else:
vllm_factors.append("None")
if self.quant_config:
pass # should be captured by model_config.quantization
if self.compilation_config:
vllm_factors.append(self.compilation_config.compute_hash())
else:
vllm_factors.append("None")
if self.kv_transfer_config:
vllm_factors.append(self.kv_transfer_config.compute_hash())
else:
vllm_factors.append("None")
if self.additional_config:
if isinstance(additional_config := self.additional_config, dict):
additional_config_hash = hashlib.md5(
json.dumps(additional_config, sort_keys=True).encode(),
usedforsecurity=False,
).hexdigest()
else:
additional_config_hash = additional_config.compute_hash()
vllm_factors.append(additional_config_hash)
else:
vllm_factors.append("None")
factors.append(vllm_factors)
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()[:10]
return hash_str
def pad_for_cudagraph(self, batch_size: int) -> int:
# if batch_size > self.compilation_config.max_capture_size,
# it should raise an IndexError.
# the caller should make sure the batch_size is within the range,
# i.e., batch_size <= self.compilation_config.max_capture_size
return self.compilation_config.bs_to_padded_graph_size[batch_size]
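# Illustrative sketch (not part of this diff): a call site can guard against
# oversized batches so they skip cudagraph replay instead of raising
# IndexError. The helper name is hypothetical.
def _example_pad_or_passthrough(self, batch_size: int) -> int:
    if batch_size <= self.compilation_config.max_capture_size:
        return self.pad_for_cudagraph(batch_size)
    return batch_size  # too large for any captured graph; run eagerly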
@staticmethod
def _get_quantization_config(
model_config: ModelConfig,
load_config: LoadConfig) -> Optional[QuantizationConfig]:
"""Get the quantization config."""
from vllm.platforms import current_platform
if model_config.quantization is not None:
from vllm.model_executor.model_loader.weight_utils import (
get_quant_config)
quant_config = get_quant_config(model_config, load_config)
capability_tuple = current_platform.get_device_capability()
if capability_tuple is not None:
capability = capability_tuple.to_int()
if capability < quant_config.get_min_capability():
raise ValueError(
f"The quantization method {model_config.quantization} "
"is not supported for the current GPU. Minimum "
f"capability: {quant_config.get_min_capability()}. "
f"Current capability: {capability}.")
supported_dtypes = quant_config.get_supported_act_dtypes()
if model_config.dtype not in supported_dtypes:
raise ValueError(
f"{model_config.dtype} is not supported for quantization "
f"method {model_config.quantization}. Supported dtypes: "
f"{supported_dtypes}")
quant_config.maybe_update_config(model_config.model)
return quant_config
return None
@staticmethod
def get_quantization_config(
model_config: ModelConfig,
load_config: LoadConfig) -> Optional[QuantizationConfig]:
import copy
# For some reason, the underscore-prefixed version of this method
# modifies the model_config object, so we deepcopy it to avoid mutation.
return VllmConfig._get_quantization_config(copy.deepcopy(model_config),
load_config)
def with_hf_config(
self,
hf_config: PretrainedConfig,
architectures: Optional[list[str]] = None,
) -> "VllmConfig":
if architectures is not None:
hf_config = copy.deepcopy(hf_config)
hf_config.architectures = architectures
model_config = copy.deepcopy(self.model_config)
model_config.hf_config = hf_config
return replace(self, model_config=model_config)
def __post_init__(self):
"""Verify configs are valid & consistent with each other.
"""
self.try_verify_and_update_config()
if self.model_config is not None:
self.model_config.verify_with_parallel_config(self.parallel_config)
self.model_config.verify_dual_chunk_attention_config(
self.load_config)
self.cache_config.verify_with_parallel_config(self.parallel_config)
if self.lora_config is not None:
self.lora_config.verify_with_cache_config(self.cache_config)
self.lora_config.verify_with_model_config(self.model_config)
if self.quant_config is None and self.model_config is not None:
self.quant_config = VllmConfig._get_quantization_config(
self.model_config, self.load_config)
from vllm.platforms import current_platform
if self.model_config is not None and \
self.scheduler_config.chunked_prefill_enabled and \
self.model_config.dtype == torch.float32 and \
current_platform.get_device_capability() == (7, 5):
logger.warning_once(
"Turing devices tensor cores do not support float32 matmul. "
"To workaround this limitation, vLLM will set 'ieee' input "
"precision for chunked prefill triton kernels.")
# If the user does not explicitly set a compilation level, then
# we use the default level. The default level depends on other
# settings (see the below code).
if self.compilation_config.level is None:
if envs.VLLM_USE_V1:
if (self.model_config is not None
and not self.model_config.enforce_eager):
self.compilation_config.level = CompilationLevel.PIECEWISE
else:
self.compilation_config.level = \
CompilationLevel.NO_COMPILATION
else:
# NB: Passing both --enforce-eager and a compilation level
# in V0 means the compilation level wins out.
self.compilation_config.level = CompilationLevel.NO_COMPILATION
# async tp is built on top of sequence parallelism
# and requires it to be enabled.
if self.compilation_config.pass_config.enable_async_tp:
self.compilation_config.pass_config.enable_sequence_parallelism = \
True
if self.compilation_config.pass_config.enable_sequence_parallelism:
self.compilation_config.custom_ops.append("+rms_norm")
if current_platform.support_static_graph_mode():
# if cudagraph_mode is not explicitly set by users, set default
# value
if self.compilation_config.cudagraph_mode is None:
if envs.VLLM_USE_V1 and self.compilation_config.level \
== CompilationLevel.PIECEWISE:
# default to full and piecewise for most models
self.compilation_config.cudagraph_mode = \
CUDAGraphMode.FULL_AND_PIECEWISE
# pooling models and encoder-decoder models
# do not support full cudagraphs
if self.model_config is not None and \
(self.model_config.pooler_config is not None
or self.model_config.is_encoder_decoder):
self.compilation_config.cudagraph_mode = \
CUDAGraphMode.PIECEWISE
else:
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
# disable cudagraph when enforce eager execution
if self.model_config is not None and \
self.model_config.enforce_eager:
logger.info("Cudagraph is disabled under eager mode")
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
elif envs.VLLM_USE_V1:
self.compilation_config.cudagraph_num_of_warmups = 1
self._set_cudagraph_sizes()
else:
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
if self.cache_config.kv_sharing_fast_prefill:
if self.speculative_config is not None and \
self.speculative_config.use_eagle():
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not "
"compatible with EAGLE as EAGLE requires correct logits "
"for all tokens while fast prefill gives incorrect logits "
"for prompt tokens.")
logger.warning_once(
"--kv-sharing-fast-prefill requires changes on model side for "
"correctness and to realize prefill savings. ")
disable_chunked_prefill_reasons: list[str] = []
if self.model_config:
if self.model_config.pooler_config:
pooling_type = self.model_config.pooler_config.pooling_type
if pooling_type is None or pooling_type.lower() != "last":
disable_chunked_prefill_reasons.append(
"Only \"last\" pooling supports chunked "
"prefill and prefix caching; disabling both.")
if not getattr(self.model_config.hf_config, "is_causal", True):
disable_chunked_prefill_reasons.append(
"Only models using causal attention supports chunked "
"prefill and prefix caching; disabling both.")
elif self.model_config.is_encoder_decoder:
self.scheduler_config.max_num_encoder_input_tokens = \
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
logger.debug(
"Encoder-decoder model detected: setting "
"`max_num_encoder_input_tokens` to encoder length (%s)",
self.scheduler_config.max_num_encoder_input_tokens)
self.scheduler_config.disable_chunked_mm_input = True
disable_chunked_prefill_reasons.append(
"Encoder-decoder models do not support chunked prefill nor"
" prefix caching; disabling both.")
if (self.model_config.architecture
== "WhisperForConditionalGeneration"
and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
!= "spawn"):
logger.warning(
"Whisper is known to have issues with "
"forked workers. If startup is hanging, "
"try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
"to 'spawn'.")
if disable_chunked_prefill_reasons:
for reason in disable_chunked_prefill_reasons:
logger.info(reason)
self.scheduler_config.chunked_prefill_enabled = False
self.scheduler_config.long_prefill_token_threshold = 0
if self.cache_config is not None:
self.cache_config.enable_prefix_caching = False
if (self.kv_events_config is not None
and self.kv_events_config.enable_kv_cache_events
and not self.cache_config.enable_prefix_caching):
logger.warning(
"KV cache events are on, but prefix caching is not enabled."
"Use --enable-prefix-caching to enable.")
if (self.kv_events_config is not None
and self.kv_events_config.publisher != "null"
and not self.kv_events_config.enable_kv_cache_events):
logger.warning("KV cache events are disabled,"
"but the scheduler is configured to publish them."
"Modify KVEventsConfig.enable_kv_cache_events"
"to True to enable.")
current_platform.check_and_update_config(self)
# Do this after all the updates to compilation_config.level
if envs.VLLM_USE_V1 and \
self.compilation_config.level == CompilationLevel.PIECEWISE:
self.compilation_config.set_splitting_ops_for_v1()
# final check of cudagraph mode after all possible updates
if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
if self.compilation_config.cudagraph_mode.has_full_cudagraphs()\
and self.model_config is not None and \
not self.model_config.disable_cascade_attn and\
not self.compilation_config.cudagraph_mode.\
has_piecewise_cudagraphs():
logger.warning_once(
"No piecewise cudagraph for executing cascade attention."
" Will fall back to eager execution if a batch runs "
"into cascade attentions")
if self.compilation_config.cudagraph_mode\
.requires_piecewise_compilation():
assert self.compilation_config.level == \
CompilationLevel.PIECEWISE, \
"Compilation level should be CompilationLevel.PIECEWISE "\
"when cudagraph_mode piecewise cudagraphs is used, "\
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
# final migrate the deprecated flags
self.compilation_config.use_cudagraph = self.compilation_config.\
cudagraph_mode != CUDAGraphMode.NONE
self.compilation_config.full_cuda_graph = self.compilation_config.\
cudagraph_mode.has_full_cudagraphs()
if self.parallel_config.enable_dbo:
a2a_backend = envs.VLLM_ALL2ALL_BACKEND
assert a2a_backend in \
["deepep_low_latency", "deepep_high_throughput"], \
"Microbatching currently only supports the deepep_low_latency and "\
f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
"supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
"variable to deepep_low_latency or deepep_high_throughput and "\
"install the DeepEP kernels."
if not self.model_config.disable_cascade_attn:
self.model_config.disable_cascade_attn = True
logger.warning_once(
"Disabling cascade attention when DBO is enabled.")
if not self.instance_id:
self.instance_id = random_uuid()[:5]
if (envs.VLLM_USE_V1
and not self.scheduler_config.disable_hybrid_kv_cache_manager):
# The warning should only be printed for hybrid models. Since we
# can't yet know whether the model is hybrid, we don't log the
# warning here; it will be logged later instead.
if not current_platform.support_hybrid_kv_cache():
# Hybrid KV cache manager is not supported on non-GPU platforms.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_transfer_config is not None:
# Hybrid KV cache manager is not compatible with KV transfer.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_events_config is not None:
# Hybrid KV cache manager is not compatible with KV events.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.model_config is not None and \
self.model_config.attention_chunk_size is not None:
if self.speculative_config is not None and \
self.speculative_config.use_eagle():
# Hybrid KV cache manager is not yet supported with chunked
# local attention + eagle.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
elif \
not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
logger.warning(
"There is a latency regression when using chunked local"
" attention with the hybrid KV cache manager. Disabling"
" it, by default. To enable it, set the environment "
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
)
# Hybrid KV cache manager is not yet supported with chunked
# local attention.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.compilation_config.debug_dump_path:
self.compilation_config.debug_dump_path = \
self.compilation_config.debug_dump_path.absolute().expanduser()
if envs.VLLM_DEBUG_DUMP_PATH is not None:
env_path = Path(envs.VLLM_DEBUG_DUMP_PATH).absolute().expanduser()
if self.compilation_config.debug_dump_path:
logger.warning(
"Config-specified debug dump path is overridden"
" by VLLM_DEBUG_DUMP_PATH to %s", env_path)
self.compilation_config.debug_dump_path = env_path
def update_sizes_for_sequence_parallelism(self,
possible_sizes: list) -> list:
# remove the sizes that are not a multiple of tp_size when
# sequence parallelism is enabled
removed_sizes = [
size for size in possible_sizes
if size % self.parallel_config.tensor_parallel_size != 0
]
if removed_sizes:
logger.warning(
"Batch sizes %s are removed because they are not "
"multiple of tp_size %d when "
"sequence parallelism is enabled", removed_sizes,
self.parallel_config.tensor_parallel_size)
return [
size for size in possible_sizes
if size % self.parallel_config.tensor_parallel_size == 0
]
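# Illustrative sketch (not part of this diff): the same filter as a pure
# helper; with tp_size=4, [1, 2, 4, 8, 16] becomes [4, 8, 16]. The helper
# name is hypothetical.
@staticmethod
def _example_filter_sizes_for_tp(sizes: list[int],
                                 tp_size: int) -> list[int]:
    return [size for size in sizes if size % tp_size == 0]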
def _set_cudagraph_sizes(self):
"""
vLLM defines the default candidate list of batch sizes for CUDA graph
capture as:
```python
max_graph_size = min(max_num_seqs * 2, 512)
# 1, 2, 4, then multiples of 8 up to max_graph_size
cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
```
In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
will be the final sizes to capture cudagraph (in descending order).
These sizes are used to capture and reuse CUDA graphs for
performance-critical paths (e.g., decoding). Capturing enables
significantly faster kernel dispatch by avoiding Python overhead. The
list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
most GPUs), which controls the total allowed number of tokens in a
batch. Since each sequence may have a variable number of tokens, the
maximum usable batch size will depend on actual sequence lengths.
Example:
With `max_num_batched_tokens = 8192`, and typical sequences
averaging ~32 tokens, most practical batch sizes fall below 256.
However, the system will still allow capture sizes up to 512 if
shape and memory permit.
Note:
If users explicitly specify cudagraph capture sizes in the
compilation config, those will override this default logic.
At runtime:
- If batch size <= one of the `cudagraph_capture_sizes`, the closest
padded CUDA graph will be used.
- If batch size > largest `cudagraph_capture_sizes`, cudagraph will
not be used.
"""
# calculate the default `batch_size_capture_list`
batch_size_capture_list = []
if self.model_config is not None and \
not self.model_config.enforce_eager:
cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
if len(cuda_graph_sizes) == 1:
batch_size_capture_list = [1, 2, 4] + [
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
]
elif len(cuda_graph_sizes) > 1:
batch_size_capture_list = sorted(cuda_graph_sizes)
else:
raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
if self.parallel_config.tensor_parallel_size > 1 and \
self.compilation_config.pass_config.enable_sequence_parallelism:
batch_size_capture_list = \
self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
max_num_tokens = self.scheduler_config.max_num_batched_tokens
batch_size_capture_list = [
size for size in batch_size_capture_list
if size <= max_num_tokens
]
self.compilation_config.init_with_cudagraph_sizes(
batch_size_capture_list)
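# Illustrative sketch (not part of this diff): the default candidate list
# described in the docstring above, as a standalone helper. The helper name
# is hypothetical.
@staticmethod
def _example_default_capture_sizes(max_graph_size: int) -> list[int]:
    # 1, 2, 4, then multiples of 8 up to and including max_graph_size,
    # e.g. max_graph_size=32 -> [1, 2, 4, 8, 16, 24, 32].
    return [1, 2, 4] + list(range(8, max_graph_size + 1, 8))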
def recalculate_max_model_len(self, max_model_len: int):
# Can only be called in try_verify_and_update_config
model_config = self.model_config
max_model_len = model_config.get_and_verify_max_len(max_model_len)
self.model_config.max_model_len = max_model_len
self.scheduler_config.max_model_len = max_model_len
def try_verify_and_update_config(self):
if self.model_config is None:
return
# Avoid running try_verify_and_update_config multiple times
if getattr(self.model_config, "config_updated", False):
return
self.model_config.config_updated = True
architecture = self.model_config.architecture
if architecture is None:
return
from vllm.model_executor.models.config import (
MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
cls = MODELS_CONFIG_MAP.get(architecture, None)
if cls is not None:
cls.verify_and_update_config(self)
if self.model_config.is_hybrid:
HybridAttentionMambaModelConfig.verify_and_update_config(self)
if self.model_config.convert_type == "classify":
# Maybe convert ForCausalLM into ForSequenceClassification model.
from vllm.model_executor.models.adapters import (
SequenceClassificationConfig)
SequenceClassificationConfig.verify_and_update_config(self)
if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
self.model_config.model_weights):
if self.load_config.load_format == "auto":
logger.info("Detected Run:ai model config. "
"Overriding `load_format` to 'runai_streamer'")
self.load_config.load_format = "runai_streamer"
elif self.load_config.load_format != "runai_streamer":
raise ValueError(f"To load a model from S3, 'load_format' "
f"must be 'runai_streamer', "
f"but got '{self.load_config.load_format}'. "
f"Model: {self.model_config.model}")
def compile_debug_dump_path(self) -> Optional[Path]:
"""Returns a rank-aware path for dumping
torch.compile debug information.
"""
if self.compilation_config.debug_dump_path is None:
return None
tp_rank = self.parallel_config.rank
dp_rank = self.parallel_config.data_parallel_rank
data_parallel_size = self.parallel_config.data_parallel_size
append_path = f"rank_{tp_rank}" if data_parallel_size == 1 \
else f"rank_{tp_rank}_dp_{dp_rank}"
path = self.compilation_config.debug_dump_path / append_path
return path
def __str__(self):
return (
f"model={self.model_config.model!r}, "
f"speculative_config={self.speculative_config!r}, "
f"tokenizer={self.model_config.tokenizer!r}, "
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
f"revision={self.model_config.revision}, "
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
f"trust_remote_code={self.model_config.trust_remote_code}, "
f"dtype={self.model_config.dtype}, "
f"max_seq_len={self.model_config.max_model_len}, "
f"download_dir={self.load_config.download_dir!r}, "
f"load_format={self.load_config.load_format}, "
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
f"data_parallel_size={self.parallel_config.data_parallel_size}, " # noqa
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
f"quantization={self.model_config.quantization}, "
f"enforce_eager={self.model_config.enforce_eager}, "
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
f"device_config={self.device_config.device}, "
f"structured_outputs_config={self.structured_outputs_config!r}, "
f"observability_config={self.observability_config!r}, "
f"seed={self.model_config.seed}, "
f"served_model_name={self.model_config.served_model_name}, "
f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa
f"pooler_config={self.model_config.pooler_config!r}, "
f"compilation_config={self.compilation_config!r}")
_current_vllm_config: Optional[VllmConfig] = None
_current_prefix: Optional[str] = None
@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig,
check_compile=False,
prefix: Optional[str] = None):
"""
Temporarily set the current vLLM config.
Used during model initialization.
We save the current vLLM config in a global variable,
so that all modules can access it, e.g. custom ops
can access the vLLM config to determine how to dispatch.
"""
global _current_vllm_config, _current_prefix
old_vllm_config = _current_vllm_config
old_prefix = _current_prefix
from vllm.compilation.counter import compilation_counter
num_models_seen = compilation_counter.num_models_seen
try:
_current_vllm_config = vllm_config
_current_prefix = prefix
yield
except Exception:
raise
else:
if check_compile:
vllm_config.compilation_config.custom_op_log_check()
if check_compile and \
vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
and compilation_counter.num_models_seen == num_models_seen:
# If the model supports compilation,
# compilation_counter.num_models_seen should be increased
# by at least 1.
# If it is not increased, it means the model does not support
# compilation (does not have @support_torch_compile decorator).
logger.warning(
"`torch.compile` is turned on, but the model %s"
" does not support it. Please open an issue on GitHub"
" if you want it to be supported.",
vllm_config.model_config.model)
finally:
_current_vllm_config = old_vllm_config
_current_prefix = old_prefix
# Clear the compilation config cache when context changes
get_cached_compilation_config.cache_clear()
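# Illustrative sketch (not part of this diff): the typical pattern during model
# construction, so that layers and custom ops built inside the block can reach
# the active config via get_current_vllm_config(). The helper name and the
# `build_model` callable are hypothetical.
def _example_init_under_config(vllm_config: VllmConfig, build_model):
    with set_current_vllm_config(vllm_config, check_compile=False):
        assert get_current_vllm_config() is vllm_config
        return build_model()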
@lru_cache(maxsize=1)
def get_cached_compilation_config():
"""Cache config to avoid repeated calls to get_current_vllm_config()"""
return get_current_vllm_config().compilation_config
def get_current_vllm_config() -> VllmConfig:
if _current_vllm_config is None:
# In CI, when we test custom ops/modules directly, the vLLM config
# is usually not set. In that case, we fall back to a default
# config.
logger.warning("Current vLLM config is not set.")
from vllm.config import VllmConfig
return VllmConfig()
return _current_vllm_config
def get_current_model_prefix() -> str:
"""
Get the prefix of the model that's currently being initialized.
"""
assert _current_prefix is not None, \
"Current model prefix is not set. "
return _current_prefix
T = TypeVar("T")
def get_layers_from_vllm_config(
vllm_config: VllmConfig,
layer_type: type[T],
layer_names: Optional[list[str]] = None) -> dict[str, T]:
"""
Get layers from the vLLM config.
Args:
vllm_config: The vLLM config.
layer_type: The type of the layer to get.
layer_names: The names of the layers to get. If None, return all layers.
"""
if layer_names is None:
layer_names = list(
vllm_config.compilation_config.static_forward_context.keys())
forward_context = vllm_config.compilation_config.static_forward_context
return {
layer_name: forward_context[layer_name]
for layer_name in layer_names
if isinstance(forward_context[layer_name], layer_type)
}
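# Illustrative sketch (not part of this diff): filtering the static forward
# context by layer type, e.g. collecting every attention layer of a model. The
# helper name and the `vllm.attention.Attention` import path are assumptions.
def _example_attention_layers(vllm_config: VllmConfig) -> dict[str, Any]:
    from vllm.attention import Attention  # assumed import location
    return get_layers_from_vllm_config(vllm_config, Attention)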
def update_config(config: DataclassInstanceT,
overrides: dict[str, Any]) -> DataclassInstanceT:
processed_overrides = {}
for field_name, value in overrides.items():
assert hasattr(
config, field_name), f"{type(config)} has no field `{field_name}`"
current_value = getattr(config, field_name)
if is_dataclass(current_value) and not is_dataclass(value):
assert isinstance(value, dict), (
f"Overrides to {type(config)}.{field_name} must be a dict"
f" or {type(current_value)}, but got {type(value)}")
value = update_config(
current_value, # type: ignore[type-var]
value)
processed_overrides[field_name] = value
return replace(config, **processed_overrides)
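# Illustrative sketch (not part of this diff): applying a (possibly nested) dict
# of overrides to a config dataclass without mutating the original. The helper
# name is hypothetical; `max_num_seqs` is an existing SchedulerConfig field.
def _example_override_scheduler(config: SchedulerConfig) -> SchedulerConfig:
    return update_config(config, {"max_num_seqs": 64})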
from vllm.config.utils import (ConfigType, SupportsMetricsInfo, config,
get_attr_docs, is_init_field, update_config)
from vllm.config.vllm import (VllmConfig, get_cached_compilation_config,
get_current_vllm_config,
get_layers_from_vllm_config,
set_current_vllm_config)
__all__ = [
# From vllm.config.cache
"BlockSize",
"CacheConfig",
"CacheDType",
"MambaDType",
"PrefixCachingHashAlgo",
# From vllm.config.compilation
"CompilationConfig",
"CompilationLevel",
"CUDAGraphMode",
"PassConfig",
# From vllm.config.device
"Device",
"DeviceConfig",
# From vllm.config.kv_events
"KVEventsConfig",
# From vllm.config.kv_transfer
"KVTransferConfig",
# From vllm.config.load
"LoadConfig",
# From vllm.config.lora
"LoRAConfig",
# From vllm.config.model
"ConvertOption",
"HfOverrides",
"LogprobsMode",
"ModelConfig",
"ModelDType",
"ModelImpl",
"RunnerOption",
"TaskOption",
"TokenizerMode",
"iter_architecture_defaults",
"try_match_architecture_defaults",
# From vllm.config.multimodal
"MMCacheType",
"MMEncoderTPMode",
"MultiModalConfig",
# From vllm.config.observability
"DetailedTraceModules",
"ObservabilityConfig",
# From vllm.config.parallel
"DistributedExecutorBackend",
"EPLBConfig",
"ParallelConfig",
# From vllm.config.pooler
"PoolerConfig",
# From vllm.config.scheduler
"RunnerType",
"SchedulerConfig",
"SchedulerPolicy",
# From vllm.config.speculative
"SpeculativeConfig",
# From vllm.config.speech_to_text
"SpeechToTextConfig",
# From vllm.config.structured_outputs
"StructuredOutputsConfig",
# From vllm.config.utils
"ConfigType",
"SupportsMetricsInfo",
"config",
"get_attr_docs",
"is_init_field",
"update_config",
# From vllm.config.vllm
"VllmConfig",
"get_cached_compilation_config",
"get_current_vllm_config",
"set_current_vllm_config",
"get_layers_from_vllm_config",
]
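# Illustrative sketch (not part of this diff): because config/__init__.py keeps
# re-exporting the moved symbols, existing call sites continue to work while new
# code can import from the concrete module directly. The helper name is
# hypothetical.
def _example_backwards_compatible_imports() -> None:
    from vllm.config import VllmConfig as reexported
    from vllm.config.vllm import VllmConfig as canonical
    assert reexported is canonical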