[V1] V1 Enablement Oracle (#13726)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
@@ -1140,6 +1140,10 @@ class CacheConfig:
|
||||
if self.cache_dtype == "auto":
|
||||
pass
|
||||
elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
|
||||
if envs.VLLM_USE_V1:
|
||||
raise NotImplementedError(
|
||||
"V1 does not yet support fp8 KV cache. "
|
||||
"Set VLLM_USE_V1=0 to enable fp8 kv cache.")
|
||||
logger.info(
|
||||
"Using fp8 data type to store kv cache. It reduces the GPU "
|
||||
"memory footprint and boosts the performance. "
|
||||
@@ -3142,16 +3146,7 @@ class CompilationConfig(BaseModel):
|
||||
self.inductor_compile_config[KEY] = False
|
||||
|
||||
if self.splitting_ops is None:
|
||||
if envs.VLLM_USE_V1:
|
||||
# v1 must split the graph on attention ops
|
||||
# for piecewise cudagraph
|
||||
self.splitting_ops = [
|
||||
"vllm.unified_attention",
|
||||
"vllm.unified_attention_with_output",
|
||||
]
|
||||
else:
|
||||
# v0 uses full graph compilation
|
||||
self.splitting_ops = []
|
||||
self.splitting_ops = []
|
||||
|
||||
for k, v in self.inductor_passes.items():
|
||||
if not isinstance(v, str):
|
||||
@@ -3246,6 +3241,15 @@ class CompilationConfig(BaseModel):
|
||||
self.bs_to_padded_graph_size[
|
||||
self.max_capture_size] = self.max_capture_size
|
||||
|
||||
def set_splitting_ops_for_v1(self):
|
||||
# If default, override splitting ops for piecewise cudagraph on V1.
|
||||
# NOTE: this function needs to be called
|
||||
if not self.splitting_ops:
|
||||
self.splitting_ops = [
|
||||
"vllm.unified_attention",
|
||||
"vllm.unified_attention_with_output",
|
||||
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class VllmConfig:
|
||||
@@ -3297,6 +3301,7 @@ class VllmConfig:
|
||||
vllm_factors: list[Any] = []
|
||||
from vllm import __version__
|
||||
vllm_factors.append(__version__)
|
||||
vllm_factors.append(envs.VLLM_USE_V1)
|
||||
if self.model_config:
|
||||
vllm_factors.append(self.model_config.compute_hash())
|
||||
else:
|
||||
@@ -3460,6 +3465,7 @@ class VllmConfig:
|
||||
# CUDA graphs do not work properly with the custom CUDA kernels.
|
||||
# FIXME(woosuk): Disable inductor to reduce the compilation time
|
||||
# and avoid any potential issues with the inductor.
|
||||
# FIXME(rob): Add function to set all of these.
|
||||
self.compilation_config.custom_ops = ["none"]
|
||||
self.compilation_config.use_cudagraph = True
|
||||
self.compilation_config.use_inductor = True
|
||||
@@ -3467,6 +3473,7 @@ class VllmConfig:
|
||||
self.compilation_config.pass_config.enable_fusion = False
|
||||
self.compilation_config.pass_config.enable_noop = False
|
||||
self.compilation_config.level = CompilationLevel.PIECEWISE
|
||||
self.compilation_config.set_splitting_ops_for_v1()
|
||||
|
||||
self._set_cudagraph_sizes()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user