From 944ffb59680c0210ec54ddb43a3c7ef015e1f842 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 22 Feb 2026 16:18:04 -0800 Subject: [PATCH] [Model Runner V2][Minor] Remove redundant `do_spec_decode` field (#35039) Signed-off-by: Nick Hill Co-authored-by: Woosuk Kwon --- vllm/v1/worker/gpu/model_runner.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index cdea0b2aa..8204fd3c3 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -153,9 +153,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.cp_interleave = self.parallel_config.cp_kv_cache_interleave_size self.speculator = None + self.num_speculative_steps = 0 self.use_aux_hidden_state_outputs = False if self.speculative_config is not None: - self.do_spec_decode = True self.num_speculative_steps = self.speculative_config.num_speculative_tokens if self.is_last_pp_rank: self.speculator = init_speculator(self.vllm_config, self.device) @@ -165,9 +165,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.use_aux_hidden_state_outputs = True if self.pp_size > 1: raise ValueError("EAGLE3 with pipeline parallel is not supported.") - else: - self.do_spec_decode = False - self.num_speculative_steps = 0 # Draft tokens propagation - for spec-dec + struct outputs. self.draft_tokens_handler = DraftTokensHandler(self.device) @@ -251,10 +248,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ) prepare_communication_buffer_for_model(self.model) - if self.do_spec_decode: - speculator_model = getattr(self.speculator, "model", None) - if speculator_model is not None: - prepare_communication_buffer_for_model(speculator_model) + if self.speculator is not None: + prepare_communication_buffer_for_model(self.speculator) def get_model(self) -> nn.Module: return self.model