# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import copy
import getpass
import json
import os
import tempfile
import threading
import time
from contextlib import contextmanager
from dataclasses import is_dataclass
from datetime import datetime
from enum import IntEnum
from functools import lru_cache
from importlib.metadata import version
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal, TypeVar, get_args

import torch
from packaging.version import Version
from pydantic import ConfigDict, Field, model_validator

import vllm.envs as envs
from vllm.logger import enable_trace_function_call, init_logger
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid
from vllm.utils.hashing import safe_hash

from .attention import AttentionConfig
from .cache import CacheConfig
from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
from .device import DeviceConfig
from .ec_transfer import ECTransferConfig
from .kernel import KernelConfig
from .kv_events import KVEventsConfig
from .kv_transfer import KVTransferConfig
from .load import LoadConfig
from .lora import LoRAConfig
from .model import ModelConfig
from .observability import ObservabilityConfig
from .offload import OffloadConfig
from .parallel import ParallelConfig
from .profiler import ProfilerConfig
from .reasoning import ReasoningConfig
from .scheduler import SchedulerConfig
from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig
from .structured_outputs import StructuredOutputsConfig
from .utils import SupportsHash, config, replace
from .weight_transfer import WeightTransferConfig

if TYPE_CHECKING:
    from transformers import PretrainedConfig

    from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
    from vllm.v1.kv_cache_interface import KVCacheConfig
else:
    PretrainedConfig = Any
    QuantizationConfig = Any
    KVCacheConfig = Any

logger = init_logger(__name__)


class OptimizationLevel(IntEnum):
    """Optimization level enum."""

    O0 = 0
    """O0: No optimization: no compilation, no cudagraphs, and no other
    optimizations; the engine just starts up immediately."""
    O1 = 1
    """O1: Quick optimizations. Dynamo+Inductor compilation and piecewise
    cudagraphs."""
    O2 = 2
    """O2: Full optimizations. Everything in -O1, plus full and piecewise
    cudagraphs."""
    O3 = 3
    """O3: Currently the same as -O2."""


PerformanceMode = Literal["balanced", "interactivity", "throughput"]

IS_QUANTIZED = False
IS_DENSE = False
# The optimizations that depend on these properties are currently set to
# False in all cases.
# if model_config is not None:
#     IS_QUANTIZED = lambda c: c.model_config.is_quantized()
#     IS_DENSE = lambda c: not c.model_config.is_model_moe()
# See https://github.com/vllm-project/vllm/issues/25689.


def enable_norm_fusion(cfg: "VllmConfig") -> bool:
    """Enable if either the RMS norm or quant FP8 custom op is active;
    otherwise Inductor handles the fusion."""

    return (
        cfg.compilation_config.is_custom_op_enabled("rms_norm")
        or cfg.compilation_config.is_custom_op_enabled("quant_fp8")
        or cfg.kernel_config.ir_op_priority.rms_norm[0] != "native"
    )


def enable_act_fusion(cfg: "VllmConfig") -> bool:
    """
    Enable if either the SiLU+Mul or quant FP8 custom op is active;
    otherwise Inductor handles the fusion.
    Also enable for FP4 models, as FP4 quant is always a custom op so
    Inductor cannot fuse it.
    """
    return (
        cfg.compilation_config.is_custom_op_enabled("silu_and_mul")
        or cfg.compilation_config.is_custom_op_enabled("quant_fp8")
        or (cfg.model_config is not None and cfg.model_config.is_nvfp4_quantized())
    )


def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
    """Enable if TP > 1 on a Hopper/Blackwell CUDA device with FlashInfer
    installed; currently disabled when DP or PP is also used (see the linked
    issues)."""
    from vllm.platforms import current_platform
    from vllm.utils.flashinfer import has_flashinfer

    return (
        cfg.parallel_config.tensor_parallel_size > 1
        and current_platform.is_cuda()
        and has_flashinfer()
        and (
            current_platform.is_device_capability_family(100)
            or current_platform.is_device_capability(90)
        )
        # tp-dp combination broken:
        # https://github.com/vllm-project/vllm/issues/34458
        and cfg.parallel_config.data_parallel_size == 1
        # tp-pp combination broken:
        # https://github.com/vllm-project/vllm/issues/35426
        and cfg.parallel_config.pipeline_parallel_size == 1
    )


def enable_rope_kvcache_fusion(cfg: "VllmConfig") -> bool:
    """Enable if AITER is enabled, the rotary embedding custom op is active,
    and either Inductor graph partition is used or the KV cache update op is
    not among the splitting ops.
    """
    from vllm._aiter_ops import rocm_aiter_ops

    return (
        rocm_aiter_ops.is_enabled()
        and cfg.compilation_config.is_custom_op_enabled("rotary_embedding")
        and (
            cfg.compilation_config.use_inductor_graph_partition
            or not cfg.compilation_config.splitting_ops_contain_kv_cache_update()
        )
    )


def enable_norm_pad_fusion(cfg: "VllmConfig") -> bool:
    """Enable if using AITER RMSNorm and the hidden size is 2880 (e.g. gpt-oss)."""
    from vllm._aiter_ops import rocm_aiter_ops

    return (
        rocm_aiter_ops.is_rmsnorm_enabled()
        and cfg.model_config is not None
        and cfg.model_config.get_hidden_size() == 2880
    )


OPTIMIZATION_LEVEL_00 = {
    "compilation_config": {
        "pass_config": {
            "fuse_norm_quant": False,
            "fuse_act_quant": False,
            "fuse_allreduce_rms": False,
            "fuse_attn_quant": False,
            "enable_sp": False,
            "fuse_gemm_comms": False,
            "fuse_act_padding": False,
            "fuse_rope_kvcache": False,
        },
        "cudagraph_mode": CUDAGraphMode.NONE,
        "use_inductor_graph_partition": False,
    },
    "kernel_config": {
        "enable_flashinfer_autotune": False,
    },
}
OPTIMIZATION_LEVEL_01 = {
    "compilation_config": {
        "pass_config": {
            "fuse_norm_quant": enable_norm_fusion,
            "fuse_act_quant": enable_act_fusion,
            "fuse_allreduce_rms": False,
            "fuse_attn_quant": False,
            "enable_sp": False,
            "fuse_gemm_comms": False,
            "fuse_act_padding": enable_norm_pad_fusion,
            "fuse_rope_kvcache": False,
        },
        "cudagraph_mode": CUDAGraphMode.PIECEWISE,
        "use_inductor_graph_partition": False,
    },
    "kernel_config": {
        "enable_flashinfer_autotune": True,
    },
}
OPTIMIZATION_LEVEL_02 = {
    "compilation_config": {
        "pass_config": {
            "fuse_norm_quant": enable_norm_fusion,
            "fuse_act_quant": enable_act_fusion,
            "fuse_allreduce_rms": enable_allreduce_rms_fusion,
            "fuse_attn_quant": IS_QUANTIZED,
            "enable_sp": IS_DENSE,
            "fuse_gemm_comms": IS_DENSE,
            "fuse_act_padding": enable_norm_pad_fusion,
            "fuse_rope_kvcache": enable_rope_kvcache_fusion,
        },
        "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
        "use_inductor_graph_partition": False,
    },
    "kernel_config": {
        "enable_flashinfer_autotune": True,
    },
}
OPTIMIZATION_LEVEL_03 = {
    "compilation_config": {
        "pass_config": {
            "fuse_norm_quant": enable_norm_fusion,
            "fuse_act_quant": enable_act_fusion,
            "fuse_allreduce_rms": enable_allreduce_rms_fusion,
            "fuse_attn_quant": IS_QUANTIZED,
            "enable_sp": IS_DENSE,
            "fuse_gemm_comms": IS_DENSE,
            "fuse_act_padding": enable_norm_pad_fusion,
            "fuse_rope_kvcache": enable_rope_kvcache_fusion,
        },
        "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
        "use_inductor_graph_partition": False,
    },
    "kernel_config": {
        "enable_flashinfer_autotune": True,
    },
}

OPTIMIZATION_LEVEL_TO_CONFIG = {
    OptimizationLevel.O0: OPTIMIZATION_LEVEL_00,
    OptimizationLevel.O1: OPTIMIZATION_LEVEL_01,
    OptimizationLevel.O2: OPTIMIZATION_LEVEL_02,
    OptimizationLevel.O3: OPTIMIZATION_LEVEL_03,
}
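
# Note: values in these default dicts may be plain values or callables that take
# the VllmConfig. Callables are resolved lazily by `_set_config_default` (via
# `_apply_optimization_level_defaults` in `__post_init__`) once the user-provided
# configuration is known, e.g. at -O2 the "fuse_norm_quant" default resolves to
# `enable_norm_fusion(config)` for the concrete config being constructed.

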
@config(config=ConfigDict(arbitrary_types_allowed=True))
class VllmConfig:
    """Dataclass which contains all vLLM-related configuration. This
    simplifies passing around the distinct configurations in the codebase.
    """

    # TODO: use default_factory once default-constructing ModelConfig doesn't
    # try to download a model
    model_config: ModelConfig = None  # type: ignore[assignment]
    """Model configuration."""
    cache_config: CacheConfig = Field(default_factory=CacheConfig)
    """Cache configuration."""
    parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
    """Parallel configuration."""
    scheduler_config: SchedulerConfig = Field(
        default_factory=SchedulerConfig.default_factory,
    )
    """Scheduler configuration."""
    device_config: DeviceConfig = Field(default_factory=DeviceConfig)
    """Device configuration."""
    load_config: LoadConfig = Field(default_factory=LoadConfig)
    """Load configuration."""
    offload_config: OffloadConfig = Field(default_factory=OffloadConfig)
    """Model weight offloading configuration."""
    attention_config: AttentionConfig = Field(default_factory=AttentionConfig)
    """Attention configuration."""
    kernel_config: KernelConfig = Field(default_factory=KernelConfig)
    """Kernel configuration."""
    lora_config: LoRAConfig | None = None
    """LoRA configuration."""
    speculative_config: SpeculativeConfig | None = None
    """Speculative decoding configuration."""
    structured_outputs_config: StructuredOutputsConfig = Field(
        default_factory=StructuredOutputsConfig
    )
    """Structured outputs configuration."""
    observability_config: ObservabilityConfig = Field(
        default_factory=ObservabilityConfig
    )
    """Observability configuration."""
    quant_config: QuantizationConfig | None = None
    """Quantization configuration."""
    compilation_config: CompilationConfig = Field(default_factory=CompilationConfig)
    """`torch.compile` and cudagraph capture configuration for the model.

    As a shorthand, one can append compilation arguments via
    `-cc.parameter=argument`, such as `-cc.mode=3` (same as `-cc='{"mode":3}'`).

    You can specify the full compilation config like so:
    `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
    """
    profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig)
    """Profiling configuration."""
    kv_transfer_config: KVTransferConfig | None = None
    """The configurations for distributed KV cache transfer."""
    kv_events_config: KVEventsConfig | None = None
    """The configurations for event publishing."""
    ec_transfer_config: ECTransferConfig | None = None
    """The configurations for distributed EC cache transfer."""
    reasoning_config: ReasoningConfig | None = None
    """The configurations for reasoning models."""
    # Some opaque config, only used to provide additional information
    # for the hash computation; mainly used for testing, debugging, or
    # out-of-tree config registration.
    additional_config: dict | SupportsHash = Field(default_factory=dict)
    """Additional config for the specified platform. Different platforms may
    support different configs. Make sure the configs are valid for the platform
    you are using. Contents must be hashable."""
    instance_id: str = ""
    """The ID of the vLLM instance."""
    optimization_level: OptimizationLevel = OptimizationLevel.O2
    """The optimization level. These levels trade startup time for
    performance, with -O0 having the best startup time and -O3 having the best
    performance. -O2 is used by default. See OptimizationLevel for a full
    description."""

    performance_mode: PerformanceMode = "balanced"
    """Performance mode for runtime behavior; 'balanced' is the default.
    'interactivity' favors low end-to-end per-request latency at small batch
    sizes (fine-grained CUDA graphs, latency-oriented kernels).
    'throughput' favors aggregate tokens/sec at high concurrency (larger CUDA
    graphs, more aggressive batching, throughput-oriented kernels)."""

    weight_transfer_config: WeightTransferConfig | None = None
    """The configurations for weight transfer during RL training."""

    shutdown_timeout: int = Field(default=0, ge=0)
    """Shutdown grace period for in-flight requests. Shutdown will be delayed for
    up to this amount of time to allow already-running requests to complete. Any
    remaining requests are aborted once the timeout is reached.
    """

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        factors: list[Any] = []

        # summarize vllm config
        vllm_factors: list[Any] = []
        from vllm import __version__

        vllm_factors.append(__version__)
        if self.model_config:
            vllm_factors.append(self.model_config.compute_hash())
            if (
                self.compilation_config
                and getattr(self.compilation_config, "compile_mm_encoder", False)
                and self.model_config.multimodal_config
            ):
                vllm_factors.append(self.model_config.multimodal_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.cache_config:
            vllm_factors.append(self.cache_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.parallel_config:
            vllm_factors.append(self.parallel_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.scheduler_config:
            vllm_factors.append(self.scheduler_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.device_config:
            vllm_factors.append(self.device_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.load_config:
            vllm_factors.append(self.load_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.offload_config:
            vllm_factors.append(self.offload_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.attention_config:
            vllm_factors.append(self.attention_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.lora_config:
            vllm_factors.append(self.lora_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.speculative_config:
            vllm_factors.append(self.speculative_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.structured_outputs_config:
            vllm_factors.append(self.structured_outputs_config.compute_hash())
        if self.profiler_config:
            vllm_factors.append(self.profiler_config.compute_hash())
        else:
            vllm_factors.append("None")
        vllm_factors.append(self.observability_config.compute_hash())
        if self.quant_config:
            pass  # should be captured by model_config.quantization
        if self.compilation_config:
            vllm_factors.append(self.compilation_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.kernel_config:
            vllm_factors.append(self.kernel_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.kv_transfer_config:
            vllm_factors.append(self.kv_transfer_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.ec_transfer_config:
            vllm_factors.append(self.ec_transfer_config.compute_hash())
        else:
            vllm_factors.append("None")
        if self.additional_config:
            if isinstance(additional_config := self.additional_config, dict):
                additional_config_hash = safe_hash(
                    json.dumps(additional_config, sort_keys=True).encode(),
                    usedforsecurity=False,
                ).hexdigest()
            else:
                additional_config_hash = additional_config.compute_hash()
            vllm_factors.append(additional_config_hash)
        else:
            vllm_factors.append("None")
        factors.append(vllm_factors)

        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
            :10
        ]
        return hash_str

    @property
    def num_speculative_tokens(self) -> int:
        if (
            self.speculative_config is not None
            and self.speculative_config.num_speculative_tokens is not None
        ):
            return self.speculative_config.num_speculative_tokens
        return 0

    @property
    def needs_dp_coordinator(self) -> bool:
        """
        Determine if the DPCoordinator process is needed.

        The DPCoordinator is needed in two cases:
        1. For MoE models with DP > 1: to handle wave coordination
           (even in external LB mode, since wave coordination runs in the coordinator)
        2. For non-MoE models in internal/hybrid LB mode: to collect and publish
           queue stats for load balancing across DP ranks

        Returns:
            True if the DPCoordinator process is needed, False otherwise.
        """

        # For non-MoE models, only need the coordinator in internal/hybrid LB
        # mode (for stats collection).
        return self.parallel_config.data_parallel_size > 1 and (
            self.model_config is None
            or self.model_config.is_moe
            or not self.parallel_config.data_parallel_external_lb
        )

    def enable_trace_function_call_for_thread(self) -> None:
        """
        Set up function tracing for the current thread,
        if enabled via the `VLLM_TRACE_FUNCTION` environment variable.
        """
        if envs.VLLM_TRACE_FUNCTION:
            tmp_dir = tempfile.gettempdir()
            # add username to tmp_dir to avoid permission issues
            tmp_dir = os.path.join(tmp_dir, getpass.getuser())
            filename = (
                f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
                f"_thread_{threading.get_ident()}_at_{datetime.now()}.log"
            ).replace(" ", "_")
            log_path = os.path.join(
                tmp_dir,
                "vllm",
                f"vllm-instance-{self.instance_id}",
                filename,
            )
            os.makedirs(os.path.dirname(log_path), exist_ok=True)
            enable_trace_function_call(log_path)

    @staticmethod
    def _get_quantization_config(
        model_config: ModelConfig, load_config: LoadConfig
    ) -> QuantizationConfig | None:
        """Get the quantization config."""
        from vllm.platforms import current_platform

        if model_config.quantization is not None:
            from vllm.model_executor.model_loader.weight_utils import get_quant_config

            quant_config = get_quant_config(model_config, load_config)
            capability_tuple = current_platform.get_device_capability()

            if capability_tuple is not None:
                capability = capability_tuple.to_int()
                if capability < quant_config.get_min_capability():
                    raise ValueError(
                        f"The quantization method {model_config.quantization} "
                        "is not supported for the current GPU. Minimum "
                        f"capability: {quant_config.get_min_capability()}. "
                        f"Current capability: {capability}."
                    )
            supported_dtypes = quant_config.get_supported_act_dtypes()
            if model_config.dtype not in supported_dtypes:
                raise ValueError(
                    f"{model_config.dtype} is not supported for quantization "
                    f"method {model_config.quantization}. Supported dtypes: "
                    f"{supported_dtypes}"
                )
            quant_config.maybe_update_config(
                model_config.model,
                hf_config=model_config.hf_config,
            )
            return quant_config
        return None

    @staticmethod
    def get_quantization_config(
        model_config: ModelConfig, load_config: LoadConfig
    ) -> QuantizationConfig | None:
        import copy

        # The private _get_quantization_config mutates the model_config
        # object, so deepcopy it here to avoid that side effect.
        return VllmConfig._get_quantization_config(
            copy.deepcopy(model_config), load_config
        )

    def with_hf_config(
        self,
        hf_config: PretrainedConfig,
        architectures: list[str] | None = None,
    ) -> "VllmConfig":
        if architectures is not None:
            hf_config = copy.deepcopy(hf_config)
            hf_config.architectures = architectures

        model_config = copy.deepcopy(self.model_config)

        # In Transformers v5, tie_word_embeddings belongs to the config of the class
        # that can see both layers to be tied. For example:
        #
        # SomeVLModel:
        #     self.language_model = SomeLanguageModel(SomeVLTextConfig)
        #     self.vision_model = SomeVisionModel(SomeVLVisionConfig)
        #
        # SomeVLModelForMultimodalLM:
        #     self.model = SomeVLModel(SomeVLConfig)
        #     self.lm_head = nn.Linear()
        #
        # Therefore, tie_word_embeddings is defined in SomeVLConfig and is not present
        # in SomeVLTextConfig*. In vLLM, the lm_head belongs to the language_model, so
        # we must ensure that tie_word_embeddings is set in the language_model's
        # config.
        #
        # *For some models, SomeVLTextConfig may also have a tie_word_embeddings field.
        # This is only the case if SomeVLTextConfig is also used for a text-only
        # version of the same model. For example:
        #
        # SomeVLModelForCausalLM:
        #     self.model = SomeLanguageModel(SomeVLTextConfig)
        #     self.lm_head = nn.Linear()
        #
        # Therefore, the presence of tie_word_embeddings in SomeVLTextConfig cannot
        # be used as a signal for whether tie_word_embeddings should be copied from
        # hf_config to the language_model config.
        if (
            Version(version("transformers")) >= Version("5.0.0")
            and model_config.is_multimodal_model
            and hasattr(model_config.hf_config, "tie_word_embeddings")
        ):
            tie_word_embeddings = model_config.hf_config.tie_word_embeddings
            hf_config.get_text_config().tie_word_embeddings = tie_word_embeddings

        model_config.hf_config = hf_config
        model_config.model_arch_config = model_config.get_model_arch_config()

        return replace(self, model_config=model_config)

    def _set_config_default(self, config_obj: Any, key: str, value: Any) -> None:
        """Set a config attribute to its default if not already set by the user.

        Args:
            config_obj: Configuration object to update.
            key: Attribute name.
            value: Default value (static or callable).
        """
        if getattr(config_obj, key) is None:
            # Some config values are known before initialization and are
            # hard-coded.
            # Other values depend on the user-given configuration, so they are
            # implemented with lambda functions and decided at run time.
            setattr(config_obj, key, value(self) if callable(value) else value)

    def _apply_optimization_level_defaults(self, defaults: dict[str, Any]) -> None:
        """Apply optimization-level defaults, using self as the root.

        Recursively applies values from `defaults` into nested config objects.
        Only fields present in `defaults` are considered, and a default is only
        written if the field is still None after all user-specified settings
        have been applied; user-specified fields are never overridden.

        Args:
            defaults: Dictionary of default values to apply.
        """
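
        # Illustrative example (hypothetical values): given
        #     defaults = {"compilation_config": {"cudagraph_mode": CUDAGraphMode.NONE,
        #                                        "pass_config": {"enable_sp": False}}}
        # apply_recursive descends into self.compilation_config and its nested
        # pass_config, filling in only the fields that are still None.
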
        def apply_recursive(config_obj: Any, config_defaults: dict[str, Any]) -> None:
            """Recursively apply defaults to config_obj, using self as root."""
            for key, value in config_defaults.items():
                if not hasattr(config_obj, key):
                    continue

                current = getattr(config_obj, key)
                if isinstance(value, dict) and is_dataclass(current):
                    apply_recursive(current, value)
                else:
                    self._set_config_default(config_obj, key, value)

        apply_recursive(self, defaults)

    def _post_init_kv_transfer_config(self) -> None:
        """Update KVTransferConfig based on top-level configs in VllmConfig.

        Right now, this function reads the offloading settings from
        CacheConfig and configures the KVTransferConfig accordingly.
        """
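        # Illustrative example (hypothetical values): with
        # cache_config.kv_offloading_size=4 (GiB) and the "native" backend,
        # this selects OffloadingConnector (or SimpleCPUOffloadConnector when
        # VLLM_USE_SIMPLE_KV_OFFLOAD is set) with cpu_bytes_to_use=4 * 2**30.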
        # KV offloading is only activated when kv_offloading_size is set.
        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
            return

        kv_offloading_backend = self.cache_config.kv_offloading_backend

        # If no KVTransferConfig is provided, create a default one.
        if self.kv_transfer_config is None:
            self.kv_transfer_config = KVTransferConfig()
        num_kv_ranks = (
            self.parallel_config.tensor_parallel_size
            * self.parallel_config.pipeline_parallel_size
        )

        if kv_offloading_backend == "native":
            if envs.VLLM_USE_SIMPLE_KV_OFFLOAD:
                config_connector = "SimpleCPUOffloadConnector"
            else:
                config_connector = "OffloadingConnector"
            self.kv_transfer_config.kv_connector = config_connector
            self.kv_transfer_config.kv_connector_extra_config.update(
                {"cpu_bytes_to_use": kv_offloading_size * (1 << 30)}
            )
        elif kv_offloading_backend == "lmcache":
            self.kv_transfer_config.kv_connector = "LMCacheConnectorV1"
            kv_gb_per_rank = kv_offloading_size / num_kv_ranks
            self.kv_transfer_config.kv_connector_extra_config = {
                "lmcache.local_cpu": True,
                "lmcache.max_local_cpu_size": kv_gb_per_rank,
            }

        # This is the same for all backends
        self.kv_transfer_config.kv_role = "kv_both"

    def __post_init__(self):
        """Verify configs are valid & consistent with each other."""

        # To give each torch profile run a unique instance name.
        self.instance_id = f"{time.time_ns()}"

        if self.performance_mode != "balanced":
            logger.info_once(
                "Performance mode set to '%s'.", self.performance_mode, scope="local"
            )

        self.try_verify_and_update_config()

        if self.model_config is not None:
            self.model_config.verify_with_parallel_config(self.parallel_config)
            self.model_config.verify_dual_chunk_attention_config(self.load_config)

            self.parallel_config.is_moe_model = self.model_config.is_moe

        if self.lora_config is not None:
            self.lora_config.verify_with_model_config(self.model_config)

        if self.quant_config is None and self.model_config is not None:
            self.quant_config = VllmConfig._get_quantization_config(
                self.model_config, self.load_config
            )

        if (
            self.quant_config is not None
            and self.model_config is not None
            and hasattr(self.quant_config, "use_deep_gemm")
            and self.quant_config.use_deep_gemm is None
        ):
            from vllm.utils.deep_gemm import should_auto_disable_deep_gemm

            model_type = getattr(self.model_config.hf_text_config, "model_type", None)
            if should_auto_disable_deep_gemm(model_type):
                self.quant_config.use_deep_gemm = False
                logger.warning_once(
                    "Auto-disabled DeepGemm for model_type=%s on Blackwell. "
                    "DeepGemm E8M0 scale format causes accuracy degradation "
                    "for this architecture. Falling back to CUTLASS. "
                    "To disable DeepGemm globally, set VLLM_USE_DEEP_GEMM=0.",
                    model_type,
                )

        from vllm.v1.executor.abstract import Executor

        executor_backend = self.parallel_config.distributed_executor_backend
        executor_class = Executor.get_class(self)
        executor_supports_async_sched = executor_class.supports_async_scheduling()

        if self.scheduler_config.async_scheduling:
            # Async scheduling explicitly enabled, hard fail any incompatibilities.
            # Currently, async scheduling only supports eagle-style speculative
            # decoding.
            if self.speculative_config is not None:
                if (
                    self.speculative_config.method not in get_args(EagleModelTypes)
                    and self.speculative_config.method not in get_args(NgramGPUTypes)
                    and self.speculative_config.method != "draft_model"
                ):
                    raise ValueError(
                        "Currently, async scheduling is only supported "
                        "with EAGLE/MTP/Draft Model/NGram GPU kinds of "
                        "speculative decoding"
                    )
                if self.speculative_config.disable_padded_drafter_batch:
                    raise ValueError(
                        "Async scheduling is not compatible with "
                        "disable_padded_drafter_batch=True."
                    )
            if not executor_supports_async_sched:
                raise ValueError(
                    f"`{executor_backend}` does not support async scheduling yet."
                )
        elif self.scheduler_config.async_scheduling is None:
            # Enable async scheduling unless there is an incompatible option.
            if (
                self.speculative_config is not None
                and self.speculative_config.method not in get_args(EagleModelTypes)
                and self.speculative_config.method not in get_args(NgramGPUTypes)
            ):
                logger.warning_once(
                    "Async scheduling not supported with %s-based "
                    "speculative decoding and will be disabled.",
                    self.speculative_config.method,
                    scope="local",
                )
                self.scheduler_config.async_scheduling = False
            elif (
                self.speculative_config is not None
                and self.speculative_config.disable_padded_drafter_batch
            ):
                logger.warning_once(
                    "Async scheduling is not compatible with "
                    "disable_padded_drafter_batch=True and will be disabled.",
                    scope="local",
                )
                self.scheduler_config.async_scheduling = False
            elif not executor_supports_async_sched:
                logger.warning_once(
                    "Async scheduling will be disabled because it is not supported "
                    "with the `%s` distributed executor backend.",
                    executor_backend,
                    scope="local",
                )
                self.scheduler_config.async_scheduling = False
            else:
                self.scheduler_config.async_scheduling = True

        logger.info_once(
            "Asynchronous scheduling is %s.",
            "enabled" if self.scheduler_config.async_scheduling else "disabled",
        )

        if self.parallel_config.disable_nccl_for_dp_synchronization is None:
            if self.scheduler_config.async_scheduling:
                if self.parallel_config.data_parallel_size > 1 and (
                    self.model_config is None or self.model_config.is_moe
                ):
                    logger.info_once(
                        "Disabling NCCL for DP synchronization "
                        "when using async scheduling.",
                        scope="local",
                    )
                    self.parallel_config.disable_nccl_for_dp_synchronization = True
            else:
                self.parallel_config.disable_nccl_for_dp_synchronization = False

        if (
            self.speculative_config is not None
            and self.scheduler_config.async_scheduling
            and self.model_config is not None
            and not self.model_config.disable_cascade_attn
        ):
            logger.warning_once(
                "Disabling cascade attention (not yet compatible with "
                "async speculative decoding).",
                scope="local",
            )
            self.model_config.disable_cascade_attn = True

        if (
            self.model_config is not None
            and self.model_config.multimodal_config is not None
            and self.model_config.multimodal_config.mm_tensor_ipc == "torch_shm"
            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"
        ):
            raise ValueError(
                "torch_shm is known to fail without "
                "VLLM_WORKER_MULTIPROC_METHOD set to spawn"
            )

        from vllm.platforms import current_platform

        if (
            self.model_config is not None
            and self.scheduler_config.enable_chunked_prefill
            and self.model_config.dtype == torch.float32
            and current_platform.get_device_capability() == (7, 5)
        ):
            logger.warning_once(
                "Turing devices' tensor cores do not support float32 matmul. "
                "To work around this limitation, vLLM will set 'ieee' input "
                "precision for chunked prefill triton kernels."
            )

        if self.model_config is not None and self.model_config.enforce_eager:
            logger.warning(
                "Enforce eager set, disabling torch.compile and CUDAGraphs. "
                "This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none"
            )
            self.compilation_config.mode = CompilationMode.NONE
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if self.compilation_config.backend == "eager" or (
            self.compilation_config.mode is not None
            and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
        ):
            logger.warning(
                "Inductor compilation was disabled by user settings; "
                "optimization settings that are only active during "
                "Inductor compilation will be ignored."
            )

        def has_blocked_weights():
            if self.quant_config is not None:
                if hasattr(self.quant_config, "weight_block_size"):
                    return self.quant_config.weight_block_size is not None
                elif hasattr(self.quant_config, "has_blocked_weights"):
                    return self.quant_config.has_blocked_weights()
            return False

        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
        # On H100 the CUDA kernel is faster than the native implementation
        # https://github.com/vllm-project/vllm/issues/25094
        if has_blocked_weights():
            custom_ops = self.compilation_config.custom_ops
            if "-quant_fp8" not in custom_ops:
                custom_ops.append("+quant_fp8")

        current_platform.apply_config_platform_defaults(self)

        if self.compilation_config.mode is None:
            if self.optimization_level > OptimizationLevel.O0:
                self.compilation_config.mode = CompilationMode.VLLM_COMPILE
            else:
                self.compilation_config.mode = CompilationMode.NONE

        # By default, enable torch wrapping only when using custom Inductor lowering
        if self.compilation_config.ir_enable_torch_wrap is None:
            self.compilation_config.ir_enable_torch_wrap = (
                self.compilation_config.mode == CompilationMode.VLLM_COMPILE
                and self.compilation_config.backend == "inductor"
            )

        if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
            if (
                self.compilation_config.backend == "inductor"
                and self.compilation_config.mode != CompilationMode.NONE
            ):
                self.compilation_config.custom_ops.append("none")
            else:
                self.compilation_config.custom_ops.append("all")

        # This populates IR op priorities,
        # must happen after compilation mode and backend are decided,
        # but before fusion defaults are applied as those may depend on op priority.
        self.kernel_config.set_platform_defaults(self)

        default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
        self._apply_optimization_level_defaults(default_config)
        if self.kernel_config.enable_flashinfer_autotune is None:
            raise ValueError(
                "KernelConfig.enable_flashinfer_autotune must be set after applying "
                "optimization level defaults."
            )

        if (
            self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
            and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
        ):
            logger.info(
                "Cudagraph mode %s is not compatible with compilation mode %s. "
                "Overriding to NONE.",
                self.compilation_config.cudagraph_mode,
                self.compilation_config.mode,
            )
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        # async TP is built on top of sequence parallelism
        # and requires it to be enabled.
        if self.compilation_config.pass_config.fuse_gemm_comms:
            self.compilation_config.pass_config.enable_sp = True
        if self.compilation_config.pass_config.enable_sp:
            if self.parallel_config.tensor_parallel_size == 1:
                logger.warning("Sequence Parallelism requires TP>1, disabling")
                self.compilation_config.pass_config.enable_sp = False
                self.compilation_config.pass_config.fuse_gemm_comms = False
            else:
                # Compute the SP threshold early; disable if None (model too
                # small for SP to be beneficial).
                pass_config = self.compilation_config.pass_config
                if pass_config.sp_min_token_num is None:
                    from vllm.compilation.passes.fusion.sequence_parallelism import (
                        get_sequence_parallelism_threshold,
                    )

                    tp_size = self.parallel_config.tensor_parallel_size
                    hidden_size = self.model_config.get_hidden_size()
                    assert isinstance(self.model_config.dtype, torch.dtype)
                    element_size = self.model_config.dtype.itemsize
                    pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                        hidden_size, tp_size, element_size
                    )

                    if pass_config.sp_min_token_num is None:
                        logger.warning(
                            "Model hidden_size too small for the SP "
                            "threshold heuristic, disabling. To force SP, "
                            "set pass_config.sp_min_token_num manually."
                        )
                        self.compilation_config.pass_config.enable_sp = False
                        self.compilation_config.pass_config.fuse_gemm_comms = False

        from vllm.utils.torch_utils import HAS_OPAQUE_TYPE

        if HAS_OPAQUE_TYPE:
            # On torch >= 2.11 the hoisted OpaqueObject approach supersedes
            # fast_moe_cold_start, so force it off.
            self.compilation_config.fast_moe_cold_start = False
        elif self.compilation_config.fast_moe_cold_start is None:
            # Resolve the default behavior: try to be as safe as possible.
            # This config is unsafe if any spec decoding draft model has a MoE,
            # so we conservatively turn it off if we see spec decoding.
            self.compilation_config.fast_moe_cold_start = (
                self.speculative_config is None
            )

        self._set_max_num_scheduled_tokens()

        if current_platform.support_static_graph_mode():
            # if cudagraph_mode has full cudagraphs, we need to check support
            if model_config := self.model_config:
                if (
                    self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                    and model_config.pooler_config is not None
                ):
                    logger.warning_once(
                        "Pooling models do not support full cudagraphs. "
                        "Overriding cudagraph_mode to PIECEWISE."
                    )
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
                elif (
                    model_config.is_encoder_decoder
                    and self.compilation_config.cudagraph_mode
                    not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
                ):
                    logger.info_once(
                        "Encoder-decoder models do not support %s. "
                        "Overriding cudagraph_mode to FULL_DECODE_ONLY.",
                        self.compilation_config.cudagraph_mode.name,
                    )
                    self.compilation_config.cudagraph_mode = (
                        CUDAGraphMode.FULL_DECODE_ONLY
                    )

            # Check if the KV connector requires PIECEWISE mode for CUDA graphs
            if (
                self.kv_transfer_config is not None
                and self.kv_transfer_config.is_kv_transfer_instance
                and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
            ):
                # Lazy import to avoid circular dependencies
                from vllm.distributed.kv_transfer.kv_connector.factory import (
                    KVConnectorFactory,
                )

                connector_cls = KVConnectorFactory.get_connector_class(
                    self.kv_transfer_config
                )
                if connector_cls.requires_piecewise_for_cudagraph(
                    self.kv_transfer_config.kv_connector_extra_config
                ):
                    logger.warning_once(
                        "KV connector %s requires PIECEWISE CUDA graph mode "
                        "due to layerwise async operations that cannot be "
                        "captured in CUDA graphs. "
                        "Overriding cudagraph_mode from %s to PIECEWISE.",
                        connector_cls.__name__,
                        self.compilation_config.cudagraph_mode.name,
                    )
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

            # disable cudagraphs when enforcing eager execution
            if self.model_config is not None and self.model_config.enforce_eager:
                logger.info("Cudagraph is disabled under eager mode")
                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
                # override related settings when enforcing eager
                self.compilation_config.max_cudagraph_capture_size = 0
                self.compilation_config.cudagraph_capture_sizes = []
            else:
                self.compilation_config.cudagraph_num_of_warmups = 1

            self._set_cudagraph_sizes()
        else:
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if self.cache_config.kv_sharing_fast_prefill:
            if (
                self.speculative_config is not None
                and self.speculative_config.use_eagle()
            ):
                raise ValueError(
                    "Fast prefill optimization for KV sharing is not "
                    "compatible with EAGLE as EAGLE requires correct logits "
                    "for all tokens while fast prefill gives incorrect logits "
                    "for prompt tokens."
                )

            logger.warning_once(
                "--kv-sharing-fast-prefill requires changes on model side for "
                "correctness and to realize prefill savings."
            )

        if (
            self.model_config
            and self.model_config.architecture == "WhisperForConditionalGeneration"
            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"
        ):
            logger.warning(
                "Whisper is known to have issues with "
                "forked workers. If startup is hanging, "
                "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
                "to 'spawn'."
            )

        if (
            self.kv_events_config is not None
            and self.kv_events_config.enable_kv_cache_events
            and not self.cache_config.enable_prefix_caching
        ):
            logger.warning(
                "KV cache events are on, but prefix caching is not enabled. "
                "Use --enable-prefix-caching to enable."
            )
        if (
            self.kv_events_config is not None
            and self.kv_events_config.publisher != "null"
            and not self.kv_events_config.enable_kv_cache_events
        ):
            logger.warning(
                "KV cache events are disabled, "
                "but the scheduler is configured to publish them. "
                "Modify KVEventsConfig.enable_kv_cache_events "
                "to True to enable."
            )
        current_platform.check_and_update_config(self)

        # Re-compute compile ranges after platform-specific config updates
        # (e.g., XPU may lower max_num_batched_tokens when MLA is enabled)
        self._set_compile_ranges()

        # Do this after all the updates to compilation_config.mode
        effective_dp_size = (
            self.parallel_config.data_parallel_size
            if self.model_config is None or self.model_config.is_moe
            else 1
        )
        self.compilation_config.set_splitting_ops_for_v1(
            all2all_backend=self.parallel_config.all2all_backend,
            data_parallel_size=effective_dp_size,
        )

        if self.compilation_config.pass_config.enable_sp:
            # With pipeline parallelism or dynamo partitioning,
            # native rms norm tracing errors due to incorrect residual shape.
            # Use custom rms norm to unblock. In the future,
            # the pass will operate on higher-level IR to avoid the issue.
            # TODO: https://github.com/vllm-project/vllm/issues/27894
            if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
                logger.warning(
                    "Sequence parallelism is enabled, but running in wrong "
                    "vllm compile mode: %s.",
                    self.compilation_config.mode,
                )

            is_fullgraph = (
                self.compilation_config.use_inductor_graph_partition
                or len(self.compilation_config.splitting_ops or []) == 0
            )
            if self.parallel_config.pipeline_parallel_size > 1 or not is_fullgraph:
                if "-rms_norm" not in self.compilation_config.custom_ops:
                    self.compilation_config.custom_ops.append("+rms_norm")
                else:
                    regime = (
                        "Dynamo partition"
                        if not is_fullgraph
                        else "pipeline parallelism"
                    )
                    logger.warning_once(
                        "Sequence parallelism not supported with "
                        "native rms_norm when using %s, "
                        "this will likely lead to an error.",
                        regime,
                    )

        # final check of cudagraph mode after all possible updates
        if current_platform.is_cuda_alike():
            if (
                self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                and self.model_config is not None
                and not self.model_config.disable_cascade_attn
                and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()  # noqa: E501
            ):
                logger.warning_once(
                    "No piecewise cudagraph for executing cascade attention."
                    " Will fall back to eager execution if a batch runs "
                    "into cascade attentions."
                )

            if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
                assert self.compilation_config.mode == CompilationMode.VLLM_COMPILE, (
                    "Compilation mode should be CompilationMode.VLLM_COMPILE "
                    "when cudagraph_mode piecewise cudagraphs is used, "
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                )
        if (
            self.model_config
            and envs.VLLM_BATCH_INVARIANT
            and not self.model_config.disable_cascade_attn
        ):
            self.model_config.disable_cascade_attn = True
            logger.warning_once(
                "Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
                scope="local",
            )

        if self.parallel_config.use_ubatching:
            a2a_backend = self.parallel_config.all2all_backend
            assert a2a_backend in [
                "deepep_low_latency",
                "deepep_high_throughput",
            ], (
                "Microbatching currently only supports the deepep_low_latency and "
                f"deepep_high_throughput all2all backends. {a2a_backend} is not "
                "supported. To fix, use --all2all-backend=deepep_low_latency or "
                "--all2all-backend=deepep_high_throughput and install the DeepEP"
                " kernels."
            )

            if not self.model_config.disable_cascade_attn:
                self.model_config.disable_cascade_attn = True
                logger.warning_once("Disabling cascade attention when DBO is enabled.")

        if not self.instance_id:
            self.instance_id = random_uuid()[:5]

        if self.reasoning_config is not None and self.model_config is not None:
            self.reasoning_config.initialize_token_ids(self.model_config)

        # Hybrid KV cache manager (HMA) runtime rules:
        # - Explicit enable (--no-disable-hybrid-kv-cache-manager): error if the
        #   runtime disables it
        # - No preference: auto-disable for unsupported features (e.g. kv connector)
        # - Explicit disable (--disable-hybrid-kv-cache-manager): always respect it
        need_disable_hybrid_kv_cache_manager = False
        # The logger should only print a warning message for hybrid models. As we
        # can't yet know whether the model is hybrid, we don't log the warning
        # message here and will log it later.
        if not current_platform.support_hybrid_kv_cache():
            # Hybrid KV cache manager is not supported on non-GPU platforms.
            need_disable_hybrid_kv_cache_manager = True
        if self.kv_events_config is not None:
            # Hybrid KV cache manager is not compatible with KV events.
            need_disable_hybrid_kv_cache_manager = True
        if (
            self.model_config is not None
            and self.model_config.attention_chunk_size is not None
        ):
            if (
                self.speculative_config is not None
                and self.speculative_config.use_eagle()
            ):
                # Hybrid KV cache manager is not yet supported with chunked
                # local attention + eagle.
                need_disable_hybrid_kv_cache_manager = True
            elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
                logger.warning(
                    "There is a latency regression when using chunked local"
                    " attention with the hybrid KV cache manager. Disabling"
                    " it by default. To enable it, set the environment variable "
                    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
                )
                # Hybrid KV cache manager is not yet supported with chunked
                # local attention.
                need_disable_hybrid_kv_cache_manager = True

        if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
            # Apply the computed default, but only if the user didn't express a
            # preference.
            if self.kv_transfer_config is not None:
                # NOTE(Kuntai): turn HMA off for connectors unless specifically
                # enabled.
                need_disable_hybrid_kv_cache_manager = True
                logger.warning(
                    "Turning off hybrid kv cache manager because "
                    "`--kv-transfer-config` is set. This will reduce the "
                    "performance of vLLM on LLMs with sliding window attention "
                    "or Mamba attention. If you are a developer of a kv connector"
                    ", please consider supporting the hybrid kv cache manager for "
                    "your connector by making sure your connector is a subclass"
                    " of `SupportsHMA` defined in kv_connector/v1/base.py and"
                    " use --no-disable-hybrid-kv-cache-manager to start vLLM."
                )
            self.scheduler_config.disable_hybrid_kv_cache_manager = (
                need_disable_hybrid_kv_cache_manager
            )
        elif (
            self.scheduler_config.disable_hybrid_kv_cache_manager is False
            and need_disable_hybrid_kv_cache_manager
        ):
            raise ValueError(
                "Hybrid KV cache manager was explicitly enabled but is not "
                "supported in this configuration. Consider omitting the "
                "--no-disable-hybrid-kv-cache-manager flag to let vLLM decide"
                " automatically."
            )

        if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
            # Default to enabling HMA if not explicitly disabled by the user or
            # the logic above.
            self.scheduler_config.disable_hybrid_kv_cache_manager = False

        if self.compilation_config.debug_dump_path:
            self.compilation_config.debug_dump_path = (
                self.compilation_config.debug_dump_path.absolute().expanduser()
            )
        if envs.VLLM_DEBUG_DUMP_PATH is not None:
            env_path = Path(envs.VLLM_DEBUG_DUMP_PATH).absolute().expanduser()
            if self.compilation_config.debug_dump_path:
                logger.warning(
                    "Config-specified debug dump path is overridden"
                    " by VLLM_DEBUG_DUMP_PATH to %s",
                    env_path,
                )
            self.compilation_config.debug_dump_path = env_path

        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
        # On H100 the CUDA kernel is faster than the native implementation
        # https://github.com/vllm-project/vllm/issues/25094
        if has_blocked_weights():
            custom_ops = self.compilation_config.custom_ops
            if "-quant_fp8" not in custom_ops:
                custom_ops.append("+quant_fp8")

        # Handle the KV connector configs
        self._post_init_kv_transfer_config()

        # Log the custom passes that are enabled
        self.compilation_config.pass_config.log_enabled_passes()

    def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list:
        # Remove the sizes that are not multiples of tp_size when
        # sequence parallelism is enabled.
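        # Illustrative example (hypothetical values): with tensor_parallel_size=4,
        # possible_sizes=[1, 2, 4, 8, 12] is filtered down to [4, 8, 12], and the
        # removed sizes [1, 2] are reported in the warning below.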
        removed_sizes = [
            size
            for size in possible_sizes
            if size % self.parallel_config.tensor_parallel_size != 0
        ]
        if removed_sizes:
            logger.warning(
                "Batch sizes %s are removed because they are not "
                "multiples of tp_size %d when "
                "sequence parallelism is enabled",
                removed_sizes,
                self.parallel_config.tensor_parallel_size,
            )

        return [
            size
            for size in possible_sizes
            if size % self.parallel_config.tensor_parallel_size == 0
        ]

    def _set_max_num_scheduled_tokens(self):
        """
        In most cases, the scheduler may schedule a batch with as many tokens as the
        worker is configured to handle. However, for some speculative decoding
        methods, the drafter model may insert additional slots into the batch when
        drafting. To account for this, we need to decrease max_num_scheduled_tokens
        by an upper bound on the number of slots that can be added.
        """
        if self.speculative_config is not None:
            scheduled_token_delta = (
                self.speculative_config.max_num_new_slots_for_drafting
                * self.scheduler_config.max_num_seqs
            )
            max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
            if self.scheduler_config.max_num_scheduled_tokens is None:
                self.scheduler_config.max_num_scheduled_tokens = (
                    max_num_batched_tokens - scheduled_token_delta
                )

                if self.scheduler_config.max_num_scheduled_tokens <= 0:
                    raise ValueError(
                        "max_num_scheduled_tokens is set to"
                        f" {self.scheduler_config.max_num_scheduled_tokens} based on"
                        " the speculative decoding settings, which does not allow"
                        " any tokens to be scheduled. Increase max_num_batched_tokens"
                        " to accommodate the additional draft token slots, or decrease"
                        " num_speculative_tokens or max_num_seqs."
                    )
                if self.scheduler_config.max_num_scheduled_tokens < 8192:
                    logger.warning_once(
                        "max_num_scheduled_tokens is set to"
                        f" {self.scheduler_config.max_num_scheduled_tokens} based on"
                        " the speculative decoding settings. This may lead to"
                        " suboptimal performance. Consider increasing"
                        " max_num_batched_tokens to accommodate the additional draft"
                        " token slots, or decrease num_speculative_tokens or"
                        " max_num_seqs.",
                        scope="local",
                    )

            max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens
            if max_num_batched_tokens < max_num_scheduled_tokens + (
                self.speculative_config.max_num_new_slots_for_drafting
                * self.scheduler_config.max_num_seqs
            ):
                raise ValueError(
                    "VllmConfig received max_num_scheduled_tokens but it does not have"
                    " enough slots to support the speculative decoding settings."
                    f" It should be greater by at least {scheduled_token_delta}, but"
                    f" got {max_num_batched_tokens=} and {max_num_scheduled_tokens=}."
                )

    def _set_cudagraph_sizes(self):
        """
        vLLM defines the default candidate list of batch sizes for CUDA graph
        capture as:

        ```python
        max_graph_size = min(max_num_seqs * 2, 512)
        # 1, 2, 4, then multiples of 8 up to 256, and then multiples of 16
        # up to max_graph_size
        cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
            range(256, max_graph_size + 1, 16))
        ```

        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
        will be the final sizes to capture cudagraph (in ascending order).

        These sizes are used to capture and reuse CUDA graphs for
        performance-critical paths (e.g., decoding). Capturing enables
        significantly faster kernel dispatch by avoiding Python overhead. The
        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
        most GPUs), which controls the total allowed number of tokens in a
        batch. Since each sequence may have a variable number of tokens, the
        maximum usable batch size will depend on actual sequence lengths.

        Example:
            With `max_num_batched_tokens = 8192`, and typical sequences
            averaging ~32 tokens, most practical batch sizes fall below 256.
            However, the system will still allow capture sizes up to 512 if
            shape and memory permit.

        Note:
            If users explicitly specify cudagraph capture sizes in the
            compilation config, those will override this default logic.
            At runtime:

            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
              padded CUDA graph will be used.
            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
              not be used.
        """
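
        # Illustrative example (hypothetical values): in the default "balanced"
        # mode with max_num_seqs=32, no speculative decoding, and
        # max_num_batched_tokens >= 64, max_cudagraph_capture_size becomes
        # min(32 * 1 * 2, 512) = 64 and the captured sizes are
        # [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64].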
if (
|
|
self.model_config is not None
|
|
and not self.model_config.enforce_eager
|
|
and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
|
):
|
|
# determine the initial max_cudagraph_capture_size
|
|
max_cudagraph_capture_size = (
|
|
self.compilation_config.max_cudagraph_capture_size
|
|
)
|
|
if max_cudagraph_capture_size is None:
|
|
decode_query_len = 1
|
|
if (
|
|
self.speculative_config
|
|
and self.speculative_config.num_speculative_tokens
|
|
):
|
|
decode_query_len += self.speculative_config.num_speculative_tokens
|
|
max_cudagraph_capture_size = min(
|
|
self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
|
|
)
|
|
max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
|
max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
|
|
|
|
assert max_cudagraph_capture_size >= 1, (
|
|
"Maximum cudagraph size should be greater than or equal to 1 "
|
|
"when using cuda graph."
|
|
)
|
|
|
|
# determine the cudagraph_capture_sizes
|
|
if self.compilation_config.cudagraph_capture_sizes is not None:
|
|
assert len(self.compilation_config.cudagraph_capture_sizes) > 0, (
|
|
"cudagraph_capture_sizes should contain at least one element "
|
|
"when using cuda graph."
|
|
)
|
|
# de-duplicate the sizes provided by the config
|
|
dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes))
|
|
cudagraph_capture_sizes = [
|
|
i for i in dedup_sizes if i <= max_num_tokens
|
|
]
|
|
# sort to make sure the sizes are in ascending order
|
|
cudagraph_capture_sizes.sort()
|
|
else:
|
|
if self.performance_mode == "interactivity":
|
|
# Fine-grained CUDA graphs at small batch sizes
|
|
# for minimal padding overhead
|
|
interactivity_max = min(max_cudagraph_capture_size, 32)
|
|
cudagraph_capture_sizes = list(range(1, interactivity_max + 1))
|
|
else:
|
|
cudagraph_capture_sizes = [
|
|
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
|
|
]
|
|
if max_cudagraph_capture_size >= 8:
|
|
# Step size 8 for small batch sizes, up to 256(not included)
|
|
cudagraph_capture_sizes += list(
|
|
range(8, min(max_cudagraph_capture_size + 1, 256), 8)
|
|
)
|
|
if max_cudagraph_capture_size >= 256:
|
|
# Step size 16 for larger batch sizes
|
|
cudagraph_capture_sizes += list(
|
|
range(256, max_cudagraph_capture_size + 1, 16)
|
|
)
|
|
# de-duplicate and sort the sizes
|
|
cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))

            if (
                self.parallel_config.tensor_parallel_size > 1
                and self.compilation_config.pass_config.enable_sp
            ):
                cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
                    cudagraph_capture_sizes
                )
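            # Note (illustrative; exact behavior lives in
            # update_sizes_for_sequence_parallelism): the adjustment typically
            # keeps only capture sizes that split evenly across
            # tensor_parallel_size, e.g. with TP=4 the sizes 1 and 2 would be
            # dropped.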

            # A user-specified compilation_config.max_cudagraph_capture_size gets
            # truncated to valid_max_size when the two are inconsistent.
            valid_max_size = (
                cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0
            )
            if (
                self.compilation_config.max_cudagraph_capture_size is not None
                and self.compilation_config.max_cudagraph_capture_size != valid_max_size
            ):
                # raise an error only when both flags are user-specified
                # and inconsistent with each other
                if self.compilation_config.cudagraph_capture_sizes is not None:
                    raise ValueError(
                        "customized max_cudagraph_capture_size "
                        f"(={self.compilation_config.max_cudagraph_capture_size}) "
                        "should be consistent with the max value of "
                        f"cudagraph_capture_sizes (={valid_max_size})"
                    )

                logger.warning(
                    "Truncating max_cudagraph_capture_size to %d",
                    valid_max_size,
                )
            # always set the final max_cudagraph_capture_size
            self.compilation_config.max_cudagraph_capture_size = valid_max_size

            if self.compilation_config.cudagraph_capture_sizes is not None and len(
                cudagraph_capture_sizes
            ) < len(self.compilation_config.cudagraph_capture_sizes):
                # If users have specified capture sizes, we only need to compare
                # the lengths before and after modification, since the modified
                # list is always a subset of the original list.
                logger.warning(
                    (
                        "cudagraph_capture_sizes specified in compilation_config"
                        " %s is overridden by config %s"
                    ),
                    self.compilation_config.cudagraph_capture_sizes,
                    cudagraph_capture_sizes,
                )
            # always write back the final sizes
            self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes

        else:
            # no cudagraph in use
            self.compilation_config.max_cudagraph_capture_size = 0
            self.compilation_config.cudagraph_capture_sizes = []

        # complete the remaining process.
        self.compilation_config.post_init_cudagraph_sizes()

    def _set_compile_ranges(self):
        """
        Set the compile ranges for the compilation config.
        """
        compilation_config = self.compilation_config
        computed_compile_ranges_endpoints = []

        # The upper bound of the compile ranges is the max_num_batched_tokens.
        compile_range_end = self.scheduler_config.max_num_batched_tokens
        if compile_range_end is not None:
            computed_compile_ranges_endpoints.append(compile_range_end)

        # Add the compile ranges for flashinfer
        if compilation_config.pass_config.fuse_allreduce_rms:
            tp_size = self.parallel_config.tensor_parallel_size
            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
            if max_size is not None:
                assert isinstance(self.model_config.dtype, torch.dtype)
                max_token_num = max_size // (
                    self.model_config.get_hidden_size()
                    * self.model_config.dtype.itemsize
                )
                if compile_range_end is not None and max_token_num < compile_range_end:
                    computed_compile_ranges_endpoints.append(max_token_num)
                else:
                    logger.debug(
                        "Max num batched tokens below allreduce-rms fusion threshold, "
                        "allreduce-rms fusion will be enabled for all num_tokens."
                    )
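                # Illustrative sizing (hypothetical numbers): with a hidden size
                # of 4096, bf16 activations (2 bytes per element) and a
                # flashinfer_max_size of 8 MiB, max_token_num =
                # 8388608 // (4096 * 2) = 1024, so the fusion range boundary
                # lands at 1024 tokens.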

        # Add the compile ranges for sequence parallelism
        if compilation_config.pass_config.enable_sp:
            pass_config = compilation_config.pass_config

            # Calculate min_token_num if not explicitly provided
            # User override works regardless of hidden_size
            if pass_config.sp_min_token_num is None:
                from vllm.compilation.passes.fusion.sequence_parallelism import (
                    get_sequence_parallelism_threshold,
                )

                tp_size = self.parallel_config.tensor_parallel_size
                hidden_size = self.model_config.get_hidden_size()
                assert isinstance(self.model_config.dtype, torch.dtype)
                element_size = self.model_config.dtype.itemsize
                pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                    hidden_size, tp_size, element_size
                )

            min_token_num = pass_config.sp_min_token_num
            max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
            if min_token_num is not None and (
                max_num_batched_tokens is not None
                and min_token_num < max_num_batched_tokens
                and min_token_num > 1
            ):
                # Add endpoint at min_token_num - 1 to ensure SP applies
                # starting from min_token_num.
                # This creates ranges: [1, min-1] (no SP), [min, max] (SP applies)
                computed_compile_ranges_endpoints.append(min_token_num - 1)

        if compilation_config.pass_config.fuse_rope_kvcache:
            max_token_num = (
                compilation_config.pass_config.rope_kvcache_fusion_max_token_num
            )
            if max_token_num is not None:
                if compile_range_end is not None and max_token_num < compile_range_end:
                    computed_compile_ranges_endpoints.append(max_token_num)
                else:
                    logger.debug(
                        "Max num batched tokens below rope+kvcache fusion threshold, "
                        "rope+kvcache fusion enabled for num_tokens <= %d.",
                        compile_range_end,
                    )

        if compilation_config.compile_ranges_endpoints is not None:
            for x in compilation_config.compile_ranges_endpoints:
                assert isinstance(x, int)
                assert x > 0, f"Invalid compile range endpoint: {x}"
                if compile_range_end is not None and x < compile_range_end and x > 1:
                    computed_compile_ranges_endpoints.append(x)
        compilation_config.compile_ranges_endpoints = sorted(
            computed_compile_ranges_endpoints
        )
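        # Illustrative outcome (hypothetical values): with
        # max_num_batched_tokens = 8192 and a sequence-parallelism threshold of
        # 1024 tokens, the endpoints become [1023, 8192], i.e. compile ranges
        # [1, 1023] (no SP) and [1024, 8192] (SP applied).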

    def try_verify_and_update_config(self):
        if self.model_config is None:
            return

        # Avoid running try_verify_and_update_config multiple times
        if getattr(self.model_config, "config_updated", False):
            return
        self.model_config.config_updated = True

        architecture = self.model_config.architecture
        if architecture is None:
            return

        from vllm.model_executor.models.config import (
            MODELS_CONFIG_MAP,
            HybridAttentionMambaModelConfig,
        )

        cls = MODELS_CONFIG_MAP.get(architecture, None)
        if cls is not None:
            cls.verify_and_update_config(self)

        if self.model_config.is_hybrid:
            HybridAttentionMambaModelConfig.verify_and_update_config(self)

        if self.model_config.convert_type == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import SequenceClassificationConfig

            SequenceClassificationConfig.verify_and_update_config(self)

        if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
            self.model_config.model_weights
        ):
            if self.load_config.load_format == "auto":
                logger.info(
                    "Detected Run:ai model config. "
                    "Overriding `load_format` to 'runai_streamer'"
                )
                self.load_config.load_format = "runai_streamer"
            elif self.load_config.load_format not in (
                "runai_streamer",
                "runai_streamer_sharded",
            ):
                raise ValueError(
                    "To load a model from object storage (S3/GCS/Azure), "
                    "'load_format' must be 'runai_streamer' or "
                    "'runai_streamer_sharded', "
                    f"but got '{self.load_config.load_format}'. "
                    f"Model: {self.model_config.model}"
                )

    def compile_debug_dump_path(self) -> Path | None:
        """Returns a rank-aware path for dumping
        torch.compile debug information.
        """
        if self.compilation_config.debug_dump_path is None:
            return None
        tp_rank = self.parallel_config.rank
        dp_rank = self.parallel_config.data_parallel_index
        append_path = f"rank_{tp_rank}_dp_{dp_rank}"
        path = self.compilation_config.debug_dump_path / append_path
        return path

    def __str__(self):
        return (
            f"model={self.model_config.model!r}, "
            f"speculative_config={self.speculative_config!r}, "
            f"tokenizer={self.model_config.tokenizer!r}, "
            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
            f"revision={self.model_config.revision}, "
            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
            f"trust_remote_code={self.model_config.trust_remote_code}, "
            f"dtype={self.model_config.dtype}, "
            f"max_seq_len={self.model_config.max_model_len}, "
            f"download_dir={self.load_config.download_dir!r}, "
            f"load_format={self.load_config.load_format}, "
            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
            f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
            f"decode_context_parallel_size={self.parallel_config.decode_context_parallel_size}, "  # noqa
            f"dcp_comm_backend={self.parallel_config.dcp_comm_backend}, "  # noqa
            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
            f"quantization={self.model_config.quantization}, "
            f"enforce_eager={self.model_config.enforce_eager}, "
            f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, "  # noqa
            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
            f"device_config={self.device_config.device}, "
            f"structured_outputs_config={self.structured_outputs_config!r}, "
            f"observability_config={self.observability_config!r}, "
            f"seed={self.model_config.seed}, "
            f"served_model_name={self.model_config.served_model_name}, "
            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
            f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, "  # noqa
            f"pooler_config={self.model_config.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}, "
            f"kernel_config={self.kernel_config!r}"
        )

    def validate_block_size(self) -> None:
        """Validate block_size against DCP and mamba constraints.

        Called after Platform.update_block_size_for_backend() has
        finalized block_size.
        """
        block_size = self.cache_config.block_size

        # DCP interleave-size compatibility
        if self.parallel_config.decode_context_parallel_size > 1:
            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
                self.parallel_config.cp_kv_cache_interleave_size
                != self.parallel_config.dcp_kv_cache_interleave_size
            ):
                self.parallel_config.cp_kv_cache_interleave_size = (
                    self.parallel_config.dcp_kv_cache_interleave_size
                )
                logger.warning_once(
                    "cp_kv_cache_interleave_size is overridden by "
                    "dcp_kv_cache_interleave_size; dcp-kv-cache-interleave-size "
                    "will be deprecated once PCP is fully supported."
                )
            assert (
                self.parallel_config.cp_kv_cache_interleave_size <= block_size
                and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0
            ), (
                f"block_size ({block_size}) must be greater than or equal to, "
                "and divisible by, cp_kv_cache_interleave_size "
                f"({self.parallel_config.cp_kv_cache_interleave_size})."
            )
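            # For example (illustrative): block_size=16 satisfies this check for
            # cp_kv_cache_interleave_size values of 1, 2, 4, 8, or 16.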

        # Mamba cache align-mode constraints
        if self.cache_config.mamba_cache_mode == "align":
            assert block_size <= self.scheduler_config.max_num_batched_tokens, (
                "In Mamba cache align mode, block_size "
                f"({block_size}) must be <= "
                "max_num_batched_tokens "
                f"({self.scheduler_config.max_num_batched_tokens})."
            )
            if self.scheduler_config.long_prefill_token_threshold > 0:
                assert self.scheduler_config.long_prefill_token_threshold >= block_size
            assert not self.scheduler_config.disable_chunked_mm_input, (
                "Chunked MM input is required because we need the flexibility "
                "to schedule a multiple of block_size tokens even if they are "
                "in the middle of a mm input"
            )

    @model_validator(mode="after")
    def validate_mamba_block_size(self) -> "VllmConfig":
        if self.model_config is None:
            return self
        mamba_block_size_is_set = (
            self.cache_config.mamba_block_size is not None
            and self.cache_config.mamba_block_size != self.model_config.max_model_len
        )
        if mamba_block_size_is_set and not self.cache_config.enable_prefix_caching:
            raise ValueError(
                "--mamba-block-size can only be set with --enable-prefix-caching"
            )
        return self


_current_vllm_config: VllmConfig | None = None
_current_prefix: str | None = None


@contextmanager
def set_current_vllm_config(
    vllm_config: VllmConfig, check_compile=False, prefix: str | None = None
):
    """
    Temporarily set the current vLLM config.
    Used during model initialization.
    We save the current vLLM config in a global variable,
    so that all modules can access it, e.g. custom ops
    can access the vLLM config to determine how to dispatch.
    """
    global _current_vllm_config, _current_prefix
    old_vllm_config = _current_vllm_config
    old_prefix = _current_prefix
    from vllm.compilation.counter import compilation_counter

    num_models_seen = compilation_counter.num_models_seen
    try:
        # Clear the compilation config cache when context changes.
        # This is needed since the old config may have been accessed
        # and cached before the new config is set.
        get_cached_compilation_config.cache_clear()

        _current_vllm_config = vllm_config
        _current_prefix = prefix
        yield
    except Exception:
        raise
    else:
        if check_compile:
            vllm_config.compilation_config.custom_op_log_check()

        if (
            check_compile
            and vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
            and compilation_counter.num_models_seen == num_models_seen
        ):
            # If the model supports compilation,
            # compilation_counter.num_models_seen should be increased
            # by at least 1.
            # If it is not increased, it means the model does not support
            # compilation (does not have @support_torch_compile decorator).
            logger.warning(
                "`torch.compile` is turned on, but the model %s"
                " does not support it. Please open an issue on GitHub"
                " if you want it to be supported.",
                vllm_config.model_config.model,
            )
    finally:
        _current_vllm_config = old_vllm_config
        _current_prefix = old_prefix
        # Clear the compilation config cache when context changes
        get_cached_compilation_config.cache_clear()


@lru_cache(maxsize=1)
def get_cached_compilation_config():
    """Cache config to avoid repeated calls to get_current_vllm_config()"""
    return get_current_vllm_config().compilation_config


def get_current_vllm_config() -> VllmConfig:
    if _current_vllm_config is None:
        raise AssertionError(
            "Current vLLM config is not set. This typically means "
            "get_current_vllm_config() was called outside of a "
            "set_current_vllm_config() context, or a CustomOp was instantiated "
            "at module import time or model forward time when config is not set. "
            "For tests that directly test custom ops/modules, use the "
            "'default_vllm_config' pytest fixture from tests/conftest.py."
        )
    return _current_vllm_config


def get_current_vllm_config_or_none() -> VllmConfig | None:
    return _current_vllm_config


T = TypeVar("T")


def get_layers_from_vllm_config(
    vllm_config: VllmConfig,
    layer_type: type[T],
    layer_names: list[str] | None = None,
) -> dict[str, T]:
    """
    Get layers from the vLLM config.

    Args:
        vllm_config: The vLLM config.
        layer_type: The type of the layer to get.
        layer_names: The names of the layers to get. If None, return all layers.
    """

    if layer_names is None:
        layer_names = list(vllm_config.compilation_config.static_forward_context.keys())

    forward_context = vllm_config.compilation_config.static_forward_context

    return {
        layer_name: forward_context[layer_name]
        for layer_name in layer_names
        if layer_name in forward_context
        and isinstance(forward_context[layer_name], layer_type)
    }