[Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)

Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-07 20:56:12 -04:00
parent d77f7fb871
commit eaa2e51088
6 changed files with 52 additions and 8 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3918,12 +3918,14 @@ class CompilationConfig:
    constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

    # CudaGraph compilation
-    use_cudagraph: bool = False
+    use_cudagraph: bool = envs.VLLM_USE_V1
    """Whether to use cudagraph inside compilation.
    - False: cudagraph inside compilation is not used.
    - True: cudagraph inside compilation is used. It requires
        that all input buffers have fixed addresses, and all
        splitting ops write their outputs to input buffers.
+    In the vLLM V1 Engine, this flag only takes effect at
+    CompilationLevel.PIECEWISE (aka -O3).
    Note that this is orthogonal to the cudagraph capture logic
    outside of compilation.
    TODO: move outside cudagraph logic into compilation.
@@ -4425,7 +4427,6 @@ class VllmConfig:
            # FIXME(rob): Add function to set all of these.
            if not self.compilation_config.custom_ops:
                self.compilation_config.custom_ops = ["none"]
-            self.compilation_config.use_cudagraph = True
            self.compilation_config.cudagraph_num_of_warmups = 1
            self.compilation_config.pass_config.enable_fusion = False
            self.compilation_config.pass_config.enable_noop = False