[V1] V1 Enablement Oracle (#13726)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Committed by Robert Shaw via GitHub on 2025-03-15 01:02:20 -04:00
parent 8c0d15d5c5
commit d4d93db2c5
96 changed files with 1537 additions and 512 deletions

@@ -1140,6 +1140,10 @@ class CacheConfig:
         if self.cache_dtype == "auto":
             pass
         elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
+            if envs.VLLM_USE_V1:
+                raise NotImplementedError(
+                    "V1 does not yet support fp8 KV cache. "
+                    "Set VLLM_USE_V1=0 to enable fp8 kv cache.")
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "
@@ -3142,16 +3146,7 @@ class CompilationConfig(BaseModel):
             self.inductor_compile_config[KEY] = False
 
         if self.splitting_ops is None:
-            if envs.VLLM_USE_V1:
-                # v1 must split the graph on attention ops
-                # for piecewise cudagraph
-                self.splitting_ops = [
-                    "vllm.unified_attention",
-                    "vllm.unified_attention_with_output",
-                ]
-            else:
-                # v0 uses full graph compilation
-                self.splitting_ops = []
+            self.splitting_ops = []
 
         for k, v in self.inductor_passes.items():
             if not isinstance(v, str):
@@ -3246,6 +3241,15 @@ class CompilationConfig(BaseModel):
         self.bs_to_padded_graph_size[
             self.max_capture_size] = self.max_capture_size
+
+    def set_splitting_ops_for_v1(self):
+        # If default, override splitting ops for piecewise cudagraph on V1.
+        # NOTE: this function needs to be called
+        if not self.splitting_ops:
+            self.splitting_ops = [
+                "vllm.unified_attention",
+                "vllm.unified_attention_with_output",
+            ]
 
 
 @dataclass
 class VllmConfig:
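The defaulting logic removed from __post_init__ above now lives in this helper, which only fills in the attention splitting ops when the user has not supplied any. A rough sketch of the intended behavior, assuming CompilationConfig can be constructed standalone and with "my.custom_op" as a hypothetical op name:

from vllm.config import CompilationConfig

cfg = CompilationConfig()            # splitting_ops left unset
cfg.set_splitting_ops_for_v1()
print(cfg.splitting_ops)
# ['vllm.unified_attention', 'vllm.unified_attention_with_output']

custom = CompilationConfig(splitting_ops=["my.custom_op"])  # hypothetical op
custom.set_splitting_ops_for_v1()    # user-provided ops are preserved
print(custom.splitting_ops)          # ['my.custom_op']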
@@ -3297,6 +3301,7 @@ class VllmConfig:
         vllm_factors: list[Any] = []
         from vllm import __version__
         vllm_factors.append(__version__)
+        vllm_factors.append(envs.VLLM_USE_V1)
         if self.model_config:
             vllm_factors.append(self.model_config.compute_hash())
         else:
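Appending envs.VLLM_USE_V1 to vllm_factors means the engine-selection flag now participates in the compilation cache key, so V0 and V1 runs cannot silently reuse each other's compiled artifacts. A toy illustration of the idea only; it does not reproduce the real compute_hash, and the version string is made up:

import hashlib

def toy_hash(vllm_version: str, use_v1: bool) -> str:
    # Mirrors the factor-list idea above: every ingredient that can change
    # the compiled artifacts should change the key.
    factors = [vllm_version, use_v1]
    return hashlib.md5(str(factors).encode()).hexdigest()

assert toy_hash("0.7.x", True) != toy_hash("0.7.x", False)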
@@ -3460,6 +3465,7 @@ class VllmConfig:
             # CUDA graphs do not work properly with the custom CUDA kernels.
             # FIXME(woosuk): Disable inductor to reduce the compilation time
             # and avoid any potential issues with the inductor.
+            # FIXME(rob): Add function to set all of these.
             self.compilation_config.custom_ops = ["none"]
             self.compilation_config.use_cudagraph = True
             self.compilation_config.use_inductor = True
@@ -3467,6 +3473,7 @@ class VllmConfig:
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
             self.compilation_config.level = CompilationLevel.PIECEWISE
+            self.compilation_config.set_splitting_ops_for_v1()
 
         self._set_cudagraph_sizes()
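Taken together, the last two hunks make the V1 path set its piecewise-compilation defaults and then delegate the splitting-op choice to the new helper. A condensed sketch of that sequence, assuming a standalone CompilationConfig; apply_v1_defaults is a made-up name for illustration and not a function in the codebase:

from vllm.config import CompilationConfig, CompilationLevel

def apply_v1_defaults(comp: CompilationConfig) -> None:
    # Same assignments as in VllmConfig.__post_init__ above.
    comp.custom_ops = ["none"]
    comp.use_cudagraph = True
    comp.use_inductor = True
    comp.pass_config.enable_fusion = False
    comp.pass_config.enable_noop = False
    comp.level = CompilationLevel.PIECEWISE
    # New in this commit: only fills splitting_ops if the user left it empty.
    comp.set_splitting_ops_for_v1()

cfg = CompilationConfig()
apply_v1_defaults(cfg)
print(cfg.level, cfg.splitting_ops)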