[Core] Make cudagraph check cuda platform only (#23005)

Signed-off-by: Chengji Yao <chengjiyao@gmail.com>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Chengji Yao <chengjiyao@gmail.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
This commit is contained in:
Chengji Yao
2025-08-16 00:46:00 -07:00
committed by GitHub
parent cc826a202b
commit 933f45334a

View File

@@ -3535,15 +3535,6 @@ class VllmConfig:
                 # in V0 means the compilation level wins out.
                 self.compilation_config.level = CompilationLevel.NO_COMPILATION
-        # if cudagraph_mode is not explicitly set by users, set default value
-        if self.compilation_config.cudagraph_mode is None:
-            if envs.VLLM_USE_V1 and self.compilation_config.level \
-                == CompilationLevel.PIECEWISE:
-                self.compilation_config.cudagraph_mode = \
-                    CUDAGraphMode.PIECEWISE
-            else:
-                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         # async tp is built on top of sequence parallelism
         # and requires it to be enabled.
         if self.compilation_config.pass_config.enable_async_tp:
@@ -3552,14 +3543,28 @@ class VllmConfig:
         if self.compilation_config.pass_config.enable_sequence_parallelism:
             self.compilation_config.custom_ops.append("+rms_norm")
-        # disable cudagraph when enforce eager execution
-        if self.model_config is not None and self.model_config.enforce_eager:
-            logger.info("Cudagraph is disabled under eager mode")
-            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-        elif envs.VLLM_USE_V1:
-            self.compilation_config.cudagraph_num_of_warmups = 1
-        self._set_cudagraph_sizes()
+        if current_platform.is_cuda_alike():
+            # if cudagraph_mode is not explicitly set by users, set default
+            # value
+            if self.compilation_config.cudagraph_mode is None:
+                if envs.VLLM_USE_V1 and self.compilation_config.level \
+                    == CompilationLevel.PIECEWISE:
+                    self.compilation_config.cudagraph_mode = \
+                        CUDAGraphMode.PIECEWISE
+                else:
+                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+            # disable cudagraph when enforce eager execution
+            if self.model_config is not None and \
+                self.model_config.enforce_eager:
+                logger.info("Cudagraph is disabled under eager mode")
+                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+            elif envs.VLLM_USE_V1:
+                self.compilation_config.cudagraph_num_of_warmups = 1
+                self._set_cudagraph_sizes()
+        else:
+            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         if self.cache_config.cpu_offload_gb > 0 and \
             self.compilation_config.level != CompilationLevel.NO_COMPILATION \
@@ -3618,7 +3623,7 @@ class VllmConfig:
         current_platform.check_and_update_config(self)
         # final check of cudagraph mode after platform-specific update
-        if envs.VLLM_USE_V1:
+        if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
             if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \
                 and self.model_config is not None and \
                 not self.model_config.disable_cascade_attn: