Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: adabeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
committed by GitHub · parent 00d3310d2d · commit 0838b52e2e
@@ -8,7 +8,7 @@ from dataclasses import asdict, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
-from pydantic import TypeAdapter, field_validator
+from pydantic import Field, TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
@@ -97,19 +97,25 @@ class PassConfig:
 
     This is separate from general `CompilationConfig` so that inductor passes
     don't all have access to full configuration - that would create a cycle as
-    the `PassManager` is set as a property of config."""
+    the `PassManager` is set as a property of config.
 
-    enable_fusion: bool = False
+    You must pass PassConfig to the VllmConfig constructor via the CompilationConfig
+    constructor. VllmConfig's post_init does further initialization.
+    If used outside of the VllmConfig, some fields may be left in an
+    improper state.
+    """
+
+    enable_fusion: bool = Field(default=None)
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
-    enable_attn_fusion: bool = False
+    enable_attn_fusion: bool = Field(default=None)
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = False
+    enable_noop: bool = Field(default=None)
     """Whether to enable the custom no-op elimination pass."""
-    enable_sequence_parallelism: bool = False
+    enable_sequence_parallelism: bool = Field(default=None)
     """Whether to enable sequence parallelism."""
-    enable_async_tp: bool = False
+    enable_async_tp: bool = Field(default=None)
     """Whether to enable async TP."""
-    enable_fi_allreduce_fusion: bool = False
+    enable_fi_allreduce_fusion: bool = Field(default=None)
     """Whether to enable flashinfer allreduce fusion."""
     fi_allreduce_fusion_max_size_mb: float | None = None
     """The threshold of the communicated tensor sizes under which
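A minimal construction sketch (assuming vllm.config keeps exporting these names): pass flags left at None are filled in later by VllmConfig.__post_init__ from the optimization-level defaults, while anything set explicitly by the user is never overridden.

    from vllm.config import CompilationConfig, PassConfig, VllmConfig

    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
            # enable_fusion is set explicitly, so post-init keeps it;
            # the remaining pass flags stay None until defaults are applied.
            pass_config=PassConfig(enable_fusion=True)
        )
    )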
@@ -167,6 +173,22 @@ class PassConfig:
         """
         return InductorPass.hash_dict(asdict(self))
 
+    @field_validator(
+        "enable_fusion",
+        "enable_attn_fusion",
+        "enable_noop",
+        "enable_sequence_parallelism",
+        "enable_async_tp",
+        "enable_fi_allreduce_fusion",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if not self.enable_noop:
             if self.enable_fusion:
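The wrap-mode validator pattern above can be exercised in isolation. Here is a small standalone sketch with plain pydantic (a toy class, not vLLM code), showing that a delayed `None` bypasses validation while real values still go through the wrapped handler:

    from collections.abc import Callable
    from typing import Any

    from pydantic import Field, field_validator
    from pydantic.dataclasses import dataclass


    @dataclass
    class Example:
        flag: bool = Field(default=None)  # None means "decide later in post-init"

        @field_validator("flag", mode="wrap")
        @classmethod
        def _skip_none(cls, value: Any, handler: Callable) -> Any:
            return value if value is None else handler(value)


    print(Example().flag)        # None (defaults are not validated)
    print(Example(flag=1).flag)  # True (the wrapped handler coerces 1 to bool)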
@@ -243,7 +265,13 @@ class DynamicShapesConfig:
 @config
 @dataclass
 class CompilationConfig:
-    """Configuration for compilation. It has three parts:
+    """Configuration for compilation.
+
+    You must pass CompilationConfig to the VllmConfig constructor.
+    VllmConfig's post_init does further initialization. If used outside of the
+    VllmConfig, some fields will be left in an improper state.
+
+    It has three parts:
 
     - Top-level Compilation control:
         - [`mode`][vllm.config.CompilationConfig.mode]
@@ -282,14 +310,14 @@ class CompilationConfig:
     """
 
     # Top-level Compilation control
-    level: int | None = None
+    level: int = Field(default=None)
     """
     Level is deprecated and will be removed in the next release,
    either 0.12.0 or 0.11.2 whichever is soonest.
     Please use mode. Currently all levels are mapped to mode.
     """
     # Top-level Compilation control
-    mode: CompilationMode | None = None
+    mode: CompilationMode = Field(default=None)
     """The compilation approach used for torch.compile-based compilation of the
     model.
 
@@ -390,7 +418,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode | None = None
+    cudagraph_mode: CUDAGraphMode = Field(default=None)
     """
     The mode of the cudagraph:
 
@@ -452,7 +480,7 @@ class CompilationConfig:
     When `enable_lora` is False, this option has no effect.
     """
 
-    use_inductor_graph_partition: bool = False
+    use_inductor_graph_partition: bool = Field(default=None)
     """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
     This partition happens at inductor codegen time after all passes and fusions
     are finished. It generates a single `call` function which wraps
@@ -648,6 +676,20 @@ class CompilationConfig:
             )
         return value
 
+    @field_validator(
+        "level",
+        "mode",
+        "cudagraph_mode",
+        "use_inductor_graph_partition",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if self.level is not None:
             logger.warning(
@@ -948,6 +990,13 @@ class CompilationConfig:
                 op,
             )
 
+    def is_custom_op_enabled(self, op: str) -> bool:
+        if "all" in self.custom_ops:
+            return f"-{op}" not in self.custom_ops
+
+        assert "none" in self.custom_ops
+        return f"+{op}" in self.custom_ops
+
     def adjust_cudagraph_sizes_for_spec_decode(
         self, uniform_decode_query_len: int, tensor_parallel_size: int
     ):
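A usage sketch of the semantics the new helper assumes (illustrative values, not taken from this diff): by the end of post-init, custom_ops always carries a base policy of "all" or "none", and individual ops are toggled with "+"/"-" prefixes.

    cfg = CompilationConfig(custom_ops=["none", "+rms_norm"])
    cfg.is_custom_op_enabled("rms_norm")   # True: explicitly opted in under "none"
    cfg.is_custom_op_enabled("quant_fp8")  # False: default-off under "none"

    cfg = CompilationConfig(custom_ops=["all", "-rms_norm"])
    cfg.is_custom_op_enabled("rms_norm")   # False: explicitly opted out under "all"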
@@ -1752,6 +1752,14 @@ class ModelConfig:
         logger.info("Using max model len %s", max_model_len)
         return max_model_len
 
+    def is_model_moe(
+        self,
+    ) -> bool:
+        return self.get_num_experts() > 1
+
+    def is_quantized(self) -> bool:
+        return getattr(self.hf_config, "quantization_config", None) is not None
+
 
 def get_served_model_name(model: str, served_model_name: str | list[str] | None):
     """
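A hypothetical call site for the two new ModelConfig helpers (illustrative only; the optimization-level tables later in this diff currently pin the dependent flags to False rather than calling these, per issue 25689):

    if model_config.is_quantized():
        # quantized checkpoints are candidates for the attention+quant fusion pass
        ...
    if not model_config.is_model_moe():
        # dense models are candidates for sequence parallelism and async TP
        ...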
@@ -9,8 +9,9 @@ import tempfile
 import threading
 import time
 from contextlib import contextmanager
-from dataclasses import replace
+from dataclasses import is_dataclass, replace
 from datetime import datetime
+from enum import IntEnum
 from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar, get_args
@@ -57,6 +58,103 @@ else:
 logger = init_logger(__name__)
 
 
+class OptimizationLevel(IntEnum):
+    """Optimization level enum."""
+
+    O0 = 0
+    """O0: No optimization: no compilation, no cudagraphs, no other
+    optimizations; just start up immediately."""
+    O1 = 1
+    """O1: Quick optimizations. Dynamo+Inductor compilation and piecewise
+    cudagraphs."""
+    O2 = 2
+    """O2: Full optimizations. Everything in -O1, plus full and piecewise cudagraphs."""
+    O3 = 3
+    """O3: Currently the same as -O2."""
+
+
+IS_QUANTIZED = False
+IS_DENSE = False
+# The optimizations that depend on these properties are currently set to False
+# in all cases.
+# if model_config is not None:
+#     IS_QUANTIZED = lambda c: c.model_config.is_quantized()
+#     IS_DENSE = lambda c: not c.model_config.is_model_moe()
+# See https://github.com/vllm-project/vllm/issues/25689.
+
+
+def enable_fusion(cfg: "VllmConfig") -> bool:
+    """Return True if the rms_norm or quant_fp8 custom op is enabled."""
+    return cfg.compilation_config.is_custom_op_enabled(
+        "rms_norm"
+    ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8")
+
+
+OPTIMIZATION_LEVEL_00 = {
+    "compilation_config": {
+        "pass_config": {
+            "enable_noop": False,
+            "enable_fusion": False,
+            "enable_fi_allreduce_fusion": False,
+            "enable_attn_fusion": False,
+            "enable_sequence_parallelism": False,
+            "enable_async_tp": False,
+        },
+        "cudagraph_mode": CUDAGraphMode.NONE,
+        "use_inductor_graph_partition": False,
+    },
+}
+OPTIMIZATION_LEVEL_01 = {
+    "compilation_config": {
+        "pass_config": {
+            "enable_noop": True,
+            "enable_fusion": enable_fusion,
+            "enable_fi_allreduce_fusion": False,
+            "enable_attn_fusion": False,
+            "enable_sequence_parallelism": False,
+            "enable_async_tp": False,
+        },
+        "cudagraph_mode": CUDAGraphMode.PIECEWISE,
+        "use_inductor_graph_partition": False,
+    },
+}
+OPTIMIZATION_LEVEL_02 = {
+    "compilation_config": {
+        "pass_config": {
+            "enable_noop": True,
+            "enable_fusion": enable_fusion,
+            "enable_fi_allreduce_fusion": False,
+            "enable_attn_fusion": IS_QUANTIZED,
+            "enable_sequence_parallelism": IS_DENSE,
+            "enable_async_tp": IS_DENSE,
+        },
+        "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
+        "use_inductor_graph_partition": False,
+    },
+}
+OPTIMIZATION_LEVEL_03 = {
+    "compilation_config": {
+        "pass_config": {
+            "enable_noop": True,
+            "enable_fusion": enable_fusion,
+            "enable_fi_allreduce_fusion": False,
+            "enable_attn_fusion": IS_QUANTIZED,
+            "enable_sequence_parallelism": IS_DENSE,
+            "enable_async_tp": IS_DENSE,
+        },
+        "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
+        "use_inductor_graph_partition": False,
+    },
+}
+
+OPTIMIZATION_LEVEL_TO_CONFIG = {
+    OptimizationLevel.O0: OPTIMIZATION_LEVEL_00,
+    OptimizationLevel.O1: OPTIMIZATION_LEVEL_01,
+    OptimizationLevel.O2: OPTIMIZATION_LEVEL_02,
+    OptimizationLevel.O3: OPTIMIZATION_LEVEL_03,
+}
+
+
 @config
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class VllmConfig:
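A short sketch of how the tables above are meant to be read (names as defined in this diff): entries are either plain constants or callables such as enable_fusion, and callables are resolved later against the fully built VllmConfig rather than at import time.

    defaults = OPTIMIZATION_LEVEL_TO_CONFIG[OptimizationLevel.O1]
    fusion_default = defaults["compilation_config"]["pass_config"]["enable_fusion"]

    # Constants are applied as-is; callables are invoked with the VllmConfig
    # instance during __post_init__, i.e. fusion_default(vllm_config) -> bool.
    print(callable(fusion_default))  # True for O1-O3; O0 uses the literal False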
@@ -116,6 +214,11 @@ class VllmConfig:
     you are using. Contents must be hashable."""
     instance_id: str = ""
     """The ID of the vLLM instance."""
+    optimization_level: OptimizationLevel = OptimizationLevel.O2
+    """The optimization level. These levels trade startup time cost for
+    performance, with -O0 having the best startup time and -O3 having the best
+    performance. -O2 is used by default. See OptimizationLevel for the full
+    description."""
 
     def compute_hash(self) -> str:
         """
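Selecting a level programmatically is then just a constructor argument (a minimal sketch; engine-argument and CLI plumbing are outside this hunk):

    config = VllmConfig(optimization_level=OptimizationLevel.O0)
    # -O0: no compilation and no cudagraphs; fastest startup, lowest performance.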
@@ -297,6 +400,50 @@ class VllmConfig:
 
         return replace(self, model_config=model_config)
 
+    def _set_config_default(self, config_obj: Any, key: str, value: Any) -> None:
+        """Set a config attribute to its default if not already set by the user.
+
+        Args:
+            config_obj: Configuration object to update.
+            key: Attribute name.
+            value: Default value (static or callable).
+        """
+        if getattr(config_obj, key) is None:
+            # Some config values are known before initialization and are
+            # hard coded.
+            # Other values depend on the user-given configuration, so they are
+            # implemented with lambda functions and decided at run time.
+            setattr(config_obj, key, value(self) if callable(value) else value)
+
+    def _apply_optimization_level_defaults(self, defaults: dict[str, Any]) -> None:
+        """Apply optimization level defaults using self as root.
+
+        Recursively applies values from defaults into nested config objects.
+        Only fields present in defaults are overwritten.
+
+        If the user configuration does not specify a value for a default field
+        and the field is still None after all user selections are applied,
+        the default value is applied to the field. User-specified fields
+        are not overridden by the defaults.
+
+        Args:
+            defaults: Dictionary of default values to apply.
+        """
+
+        def apply_recursive(config_obj: Any, config_defaults: dict[str, Any]) -> None:
+            """Recursively apply defaults to config_obj, using self as root."""
+            for key, value in config_defaults.items():
+                if not hasattr(config_obj, key):
+                    continue
+
+                current = getattr(config_obj, key)
+                if isinstance(value, dict) and is_dataclass(current):
+                    apply_recursive(current, value)
+                else:
+                    self._set_config_default(config_obj, key, value)
+
+        apply_recursive(self, defaults)
+
     def _post_init_kv_transfer_config(self) -> None:
         """Update KVTransferConfig based on top-level configs in VllmConfig.
 
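The recursive behaviour is easiest to see on toy dataclasses (a standalone sketch of the same pattern, not vLLM code): only fields that are still None receive a default, and nested dicts descend into nested dataclass fields.

    from dataclasses import dataclass, is_dataclass


    @dataclass
    class Inner:
        enable_noop: bool | None = None


    @dataclass
    class Outer:
        inner: Inner | None = None
        cudagraph_mode: str | None = None


    def apply_defaults(obj, defaults):
        for key, value in defaults.items():
            if not hasattr(obj, key):
                continue
            current = getattr(obj, key)
            if isinstance(value, dict) and is_dataclass(current):
                apply_defaults(current, value)
            elif current is None:
                setattr(obj, key, value(obj) if callable(value) else value)


    outer = Outer(inner=Inner(enable_noop=False))  # user explicitly chose False
    apply_defaults(outer, {"inner": {"enable_noop": True}, "cudagraph_mode": "NONE"})
    print(outer.inner.enable_noop, outer.cudagraph_mode)  # False NONE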
@@ -434,17 +581,47 @@ class VllmConfig:
                 "precision for chunked prefill triton kernels."
             )
 
-        # If the user does not explicitly set a compilation mode, then
-        # we use the default mode. The default mode depends on other
-        # settings (see the below code).
+        if (
+            self.optimization_level > OptimizationLevel.O0
+            and self.model_config is not None
+            and self.model_config.enforce_eager
+        ):
+            logger.warning("Enforce eager set, overriding optimization level to -O0")
+            self.optimization_level = OptimizationLevel.O0
+
+        if self.compilation_config.backend == "eager" or (
+            self.compilation_config.mode is not None
+            and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
+        ):
+            logger.warning(
+                "Inductor compilation was disabled by user settings; "
+                "optimization settings that are only active during "
+                "Inductor compilation will be ignored."
+            )
+
+        def has_blocked_weights():
+            if self.quant_config is not None:
+                if hasattr(self.quant_config, "weight_block_size"):
+                    return self.quant_config.weight_block_size is not None
+                elif hasattr(self.quant_config, "has_blocked_weights"):
+                    return self.quant_config.has_blocked_weights()
+            return False
+
+        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
+        # On H100 the CUDA kernel is faster than
+        # native implementation
+        # https://github.com/vllm-project/vllm/issues/25094
+        if has_blocked_weights():
+            custom_ops = self.compilation_config.custom_ops
+            if "-quant_fp8" not in custom_ops:
+                custom_ops.append("+quant_fp8")
+
         if self.compilation_config.mode is None:
-            if self.model_config is not None and not self.model_config.enforce_eager:
+            if self.optimization_level > OptimizationLevel.O0:
                 self.compilation_config.mode = CompilationMode.VLLM_COMPILE
             else:
                 self.compilation_config.mode = CompilationMode.NONE
 
-        # If user does not set custom ops via none or all set it here based on
-        # compilation mode and backend.
         if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
             if (
                 self.compilation_config.backend == "inductor"
@@ -454,23 +631,33 @@
             else:
                 self.compilation_config.custom_ops.append("all")
 
+        default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
+        self._apply_optimization_level_defaults(default_config)
+        if (
+            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
+        ):
+            logger.info(
+                "Cudagraph mode %s is not compatible with compilation mode %s. "
+                "Overriding to NONE.",
+                self.compilation_config.cudagraph_mode,
+                self.compilation_config.mode,
+            )
+            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+
         # async tp is built on top of sequence parallelism
         # and requires it to be enabled.
         if self.compilation_config.pass_config.enable_async_tp:
             self.compilation_config.pass_config.enable_sequence_parallelism = True
         if self.compilation_config.pass_config.enable_sequence_parallelism:
             if "-rms_norm" in self.compilation_config.custom_ops:
                 logger.warning(
                     "RMS norm force disabled, sequence parallelism might break"
                 )
             else:
                 self.compilation_config.custom_ops.append("+rms_norm")
 
         if current_platform.support_static_graph_mode():
-            # if cudagraph_mode is not explicitly set by users, set default
-            # value
-            if self.compilation_config.cudagraph_mode is None:
-                if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
-                    # default to full and piecewise for most models
-                    self.compilation_config.cudagraph_mode = (
-                        CUDAGraphMode.FULL_AND_PIECEWISE
-                    )
-                else:
-                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-
             # if cudagraph_mode has full cudagraphs, we need to check support
             if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
                 # decode context parallel does not support full cudagraphs