Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: adabeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
commit 0838b52e2e
parent 00d3310d2d
@@ -8,7 +8,7 @@ from dataclasses import asdict, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
-from pydantic import TypeAdapter, field_validator
+from pydantic import Field, TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
@@ -97,19 +97,25 @@ class PassConfig:
 
     This is separate from general `CompilationConfig` so that inductor passes
     don't all have access to full configuration - that would create a cycle as
-    the `PassManager` is set as a property of config."""
+    the `PassManager` is set as a property of config.
 
-    enable_fusion: bool = False
+    You must pass PassConfig to the VllmConfig constructor via the
+    CompilationConfig constructor. VllmConfig's post_init does further
+    initialization. If used outside of a VllmConfig, some fields may be
+    left in an improper state.
+    """
+
+    enable_fusion: bool = Field(default=None)
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
-    enable_attn_fusion: bool = False
+    enable_attn_fusion: bool = Field(default=None)
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = False
+    enable_noop: bool = Field(default=None)
     """Whether to enable the custom no-op elimination pass."""
-    enable_sequence_parallelism: bool = False
+    enable_sequence_parallelism: bool = Field(default=None)
     """Whether to enable sequence parallelism."""
-    enable_async_tp: bool = False
+    enable_async_tp: bool = Field(default=None)
     """Whether to enable async TP."""
-    enable_fi_allreduce_fusion: bool = False
+    enable_fi_allreduce_fusion: bool = Field(default=None)
     """Whether to enable flashinfer allreduce fusion."""
     fi_allreduce_fusion_max_size_mb: float | None = None
     """The threshold of the communicated tensor sizes under which
@@ -167,6 +173,22 @@ class PassConfig:
         """
         return InductorPass.hash_dict(asdict(self))
 
+    @field_validator(
+        "enable_fusion",
+        "enable_attn_fusion",
+        "enable_noop",
+        "enable_sequence_parallelism",
+        "enable_async_tp",
+        "enable_fi_allreduce_fusion",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if not self.enable_noop:
            if self.enable_fusion:
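The `Field(default=None)` plus wrap-validator combination is what makes the delayed initialization work: `None` slips past per-field validation and is resolved later in post-init. A minimal, self-contained sketch of that pattern (`ExampleConfig` is hypothetical, not vLLM code):

from collections.abc import Callable
from typing import Any

from pydantic import Field, field_validator
from pydantic.dataclasses import dataclass


@dataclass
class ExampleConfig:
    # `None` means "not decided yet"; a later post-init step fills it in.
    enable_fusion: bool = Field(default=None)

    @field_validator("enable_fusion", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        if value is None:
            return value  # bypass bool validation while the field is unset
        return handler(value)  # normal validation for real values


ExampleConfig()                    # enable_fusion stays None
ExampleConfig(enable_fusion=True)  # validated as a normal bool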
@@ -243,7 +265,13 @@ class DynamicShapesConfig:
 @config
 @dataclass
 class CompilationConfig:
-    """Configuration for compilation. It has three parts:
+    """Configuration for compilation.
+
+    You must pass CompilationConfig to the VllmConfig constructor.
+    VllmConfig's post_init does further initialization. If used outside of a
+    VllmConfig, some fields will be left in an improper state.
+
+    It has three parts:
 
     - Top-level Compilation control:
         - [`mode`][vllm.config.CompilationConfig.mode]
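Concretely, the intended construction path now runs through `VllmConfig`; building a `CompilationConfig` on its own leaves the `Field(default=None)` fields unresolved. A hedged usage sketch, assuming `vllm.config` exports these names (paths may differ across versions):

from vllm.config import CompilationConfig, VllmConfig

# VllmConfig's post_init finalizes the delayed (None) compilation fields.
vllm_config = VllmConfig(compilation_config=CompilationConfig())

# Standalone construction is still possible, but some fields remain None
# until post-init runs, per the docstring's warning.
standalone = CompilationConfig()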
@@ -282,14 +310,14 @@ class CompilationConfig:
     """
 
     # Top-level Compilation control
-    level: int | None = None
+    level: int = Field(default=None)
     """
     Level is deprecated and will be removed in the next release,
     either 0.12.0 or 0.11.2, whichever comes first.
     Please use `mode` instead. Currently all levels are mapped to mode.
     """
     # Top-level Compilation control
-    mode: CompilationMode | None = None
+    mode: CompilationMode = Field(default=None)
     """The compilation approach used for torch.compile-based compilation of the
     model.
 
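The migration the deprecation note asks for is mechanical. A sketch, assuming `CompilationMode` is exported alongside `CompilationConfig` and that `VLLM_COMPILE` is the full-compilation member (both names are assumptions here):

from vllm.config import CompilationConfig, CompilationMode

# Preferred: set `mode` directly.
cfg = CompilationConfig(mode=CompilationMode.VLLM_COMPILE)

# Deprecated: `level` still validates, but __post_init__ logs a warning
# and maps it onto `mode`.
legacy = CompilationConfig(level=3)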
@@ -390,7 +418,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode | None = None
+    cudagraph_mode: CUDAGraphMode = Field(default=None)
     """
     The mode of the cudagraph:
 
@@ -452,7 +480,7 @@ class CompilationConfig:
     When `enable_lora` is False, this option has no effect.
     """
 
-    use_inductor_graph_partition: bool = False
+    use_inductor_graph_partition: bool = Field(default=None)
     """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
     This partition happens at inductor codegen time after all passes and fusions
     are finished. It generates a single `call` function which wraps
@@ -648,6 +676,20 @@ class CompilationConfig:
         )
         return value
 
+    @field_validator(
+        "level",
+        "mode",
+        "cudagraph_mode",
+        "use_inductor_graph_partition",
+        mode="wrap",
+    )
+    @classmethod
+    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
+        """Skip validation if the value is `None` when initialisation is delayed."""
+        if value is None:
+            return value
+        return handler(value)
+
     def __post_init__(self) -> None:
         if self.level is not None:
             logger.warning(
@@ -948,6 +990,13 @@ class CompilationConfig:
             op,
         )
 
+    def is_custom_op_enabled(self, op: str) -> bool:
+        if "all" in self.custom_ops:
+            return f"-{op}" not in self.custom_ops
+
+        assert "none" in self.custom_ops
+        return f"+{op}" in self.custom_ops
+
     def adjust_cudagraph_sizes_for_spec_decode(
         self, uniform_decode_query_len: int, tensor_parallel_size: int
     ):
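The new helper's matching rules are compact enough to restate standalone; a sketch (not vLLM's API surface, just the same logic over a plain list) covering the four interesting cases:

def is_custom_op_enabled(custom_ops: list[str], op: str) -> bool:
    # "all" turns every op on by default; "-op" opts a specific op out.
    if "all" in custom_ops:
        return f"-{op}" not in custom_ops
    # Otherwise "none" turns every op off; "+op" opts a specific op in.
    assert "none" in custom_ops
    return f"+{op}" in custom_ops


assert is_custom_op_enabled(["all", "-rms_norm"], "rms_norm") is False
assert is_custom_op_enabled(["all", "-rms_norm"], "silu_mul") is True
assert is_custom_op_enabled(["none", "+rms_norm"], "rms_norm") is True
assert is_custom_op_enabled(["none", "+rms_norm"], "silu_mul") is False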