Revert "[compile] Initialize passes at VllmBackend init" (#37733)

This commit is contained in:
Simon Mo
2026-03-20 21:35:49 -07:00
committed by GitHub
parent 3ffa52009f
commit 1fa1e53a73
3 changed files with 5 additions and 19 deletions

View File

@@ -32,9 +32,9 @@ from vllm.platforms import current_platform
def test_compile_config_repr_succeeds():
# setup: VllmBackend mutates the config object
# Note: VllmBackend.__init__ already calls configure_post_pass()
config = VllmConfig()
_ = VllmBackend(config)
backend = VllmBackend(config)
backend.configure_post_pass()
# test that repr(config) succeeds
val = repr(config)

View File

@@ -836,18 +836,8 @@ class VllmBackend:
# in future we need PostGradPassManager.uuid() to be executed
# only at compile time.
self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config)
# Configure post-grad passes (including AllReduceFusionPass) during
# backend init rather than at torch.compile time, so that expensive
# one-time setup (e.g. FlashInfer workspace allocation) is not
# attributed to compilation latency.
start = time.time()
self.configure_post_pass()
logger.info_once(
"Post-grad pass configuration time: %.2f s",
time.time() - start,
scope="local",
)
# `torch.compile` is JIT compiled, so we don't need to
# do anything here
def collect_standalone_compile_artifacts(
self,
@@ -1128,6 +1118,7 @@ class VllmBackend:
assert not self._called, "VllmBackend can only be called once"
self.graph = graph
self.configure_post_pass()
if self.compilation_config.use_inductor_graph_partition:
# Let Inductor decide partitioning; avoid FX-level pre-splitting.

View File

@@ -380,11 +380,6 @@ def _support_torch_compile(
compilation_counter.num_models_seen += 1
self.compiled = False
# Skip if a parent class's @support_torch_compile already
# initialized the compile wrapper
if hasattr(self, "_compiled_callable"):
return
# Handled by monkeypatching `TorchCompileWithNoGuardsWrapper` into base class
TorchCompileWithNoGuardsWrapper.__init__(
self,