[BugFix] Fix use_cudagraph=False (#19612)

Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-18 20:23:12 -04:00
parent d49adea1f9
commit ed33349738
3 changed files with 33 additions and 27 deletions
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -1,14 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
-import torch

 import vllm
 from vllm.compilation.counter import compilation_counter
-from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
-                         set_current_vllm_config)
-
-from .piecewise.test_simple import SillyModel
+from vllm.config import VllmConfig


 def test_use_cudagraphs_dynamic(monkeypatch):
@@ -22,23 +18,24 @@ def test_use_cudagraphs_dynamic(monkeypatch):


@pytest.mark.parametrize("enabled", [True, False])
-def test_use_cudagraphs(enabled):
+def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
    assert vllm.envs.VLLM_USE_V1
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        use_cudagraph=enabled,
-        cudagraph_capture_sizes=[100],
-    ))
-    with set_current_vllm_config(vllm_config):
-        model = SillyModel(vllm_config=vllm_config, prefix='')

-    inputs = torch.randn(100, device="cuda")
+    # Disable V1 multiprocessing so compilation_counter updates in this process
+    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

-    with compilation_counter.expect(
-            num_graphs_seen=1,  # one graph for the model
-            num_cudagraph_captured=1 if enabled else 0,
-    ):
-        # first run is warmup
-        model(inputs)
-        # second run does CUDAGraphs recording (if enabled)
-        model(inputs)
+    compilation_config = {
+        "cudagraph_capture_sizes": [100],
+        "use_cudagraph": enabled,
+    }
+    with (
+            compilation_counter.expect(
+                num_graphs_seen=1,
+                num_gpu_runner_capture_triggers=1 if enabled else 0,
+                num_cudagraph_captured=13 if enabled else 0,
+            ),
+            # Loading the model triggers compilation and, if enabled, capture.
+            vllm_runner('facebook/opt-125m',
+                        compilation_config=compilation_config,
+                        gpu_memory_utilization=0.4) as _):
+        pass