diff --git a/tests/test_config.py b/tests/test_config.py index ee5ad0528..f98b30f99 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -32,9 +32,9 @@ from vllm.platforms import current_platform def test_compile_config_repr_succeeds(): # setup: VllmBackend mutates the config object - # Note: VllmBackend.__init__ already calls configure_post_pass() config = VllmConfig() - _ = VllmBackend(config) + backend = VllmBackend(config) + backend.configure_post_pass() # test that repr(config) succeeds val = repr(config) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 9d5b4bc93..e049ef345 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -836,18 +836,8 @@ class VllmBackend: # in future we need PostGradPassManager.uuid() to be executed # only at compile time. self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config) - - # Configure post-grad passes (including AllReduceFusionPass) during - # backend init rather than at torch.compile time, so that expensive - # one-time setup (e.g. FlashInfer workspace allocation) is not - # attributed to compilation latency. - start = time.time() - self.configure_post_pass() - logger.info_once( - "Post-grad pass configuration time: %.2f s", - time.time() - start, - scope="local", - ) + # `torch.compile` is JIT compiled, so we don't need to + # do anything here def collect_standalone_compile_artifacts( self, @@ -1128,6 +1118,7 @@ class VllmBackend: assert not self._called, "VllmBackend can only be called once" self.graph = graph + self.configure_post_pass() if self.compilation_config.use_inductor_graph_partition: # Let Inductor decide partitioning; avoid FX-level pre-splitting. diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 605dc2364..5ecc82e31 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -380,11 +380,6 @@ def _support_torch_compile( compilation_counter.num_models_seen += 1 self.compiled = False - # Skip if a parent class's @support_torch_compile already - # initialized the compile wrapper - if hasattr(self, "_compiled_callable"): - return - # Handled by monkeypatching `TorchCompileWithNoGuardsWrapper` into base class TorchCompileWithNoGuardsWrapper.__init__( self,