Add ability to use CUDAGraphs with use_inductor=False (#17345)

Signed-off-by: rzou <zou3519@gmail.com>
2025-05-28 22:16:52 -04:00
parent 515b413ebf
commit 26b4fa45be
5 changed files with 51 additions and 13 deletions
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -12,6 +12,7 @@ import torch._inductor.compile_fx
 import torch.fx as fx

 import vllm.envs as envs
+from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
 from vllm.utils import is_torch_equal_or_newer

@@ -175,6 +176,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
        runtime_shape: Optional[int] = None,
        key: Optional[str] = None,
    ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_inductor_compiles += 1
        current_config = {}
        if compiler_config is not None:
            current_config.update(compiler_config)
@@ -262,6 +264,7 @@ class InductorAdaptor(CompilerInterface):
        runtime_shape: Optional[int] = None,
        key: Optional[str] = None,
    ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_inductor_compiles += 1
        from torch._inductor.compile_fx import compile_fx
        current_config = {}
        if compiler_config is not None:
@@ -528,6 +531,7 @@ class EagerAdaptor(CompilerInterface):
        runtime_shape: Optional[int] = None,
        key: Optional[str] = None,
    ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_eager_compiles += 1
        # we don't need to compile the graph, just return the graph itself.
        # It does not support caching, return None for the handle.
        return graph, None