[Model Runner V2][Minor] Remove redundant do_spec_decode field (#35039)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk@inferact.ai>
This commit is contained in:
Nick Hill
2026-02-22 16:18:04 -08:00
committed by GitHub
parent 2bcf71b9c0
commit 944ffb5968

View File

@@ -153,9 +153,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.cp_interleave = self.parallel_config.cp_kv_cache_interleave_size
self.speculator = None
self.num_speculative_steps = 0
self.use_aux_hidden_state_outputs = False
if self.speculative_config is not None:
self.do_spec_decode = True
self.num_speculative_steps = self.speculative_config.num_speculative_tokens
if self.is_last_pp_rank:
self.speculator = init_speculator(self.vllm_config, self.device)
@@ -165,9 +165,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.use_aux_hidden_state_outputs = True
if self.pp_size > 1:
raise ValueError("EAGLE3 with pipeline parallel is not supported.")
else:
self.do_spec_decode = False
self.num_speculative_steps = 0
# Draft tokens propagation - for spec-dec + struct outputs.
self.draft_tokens_handler = DraftTokensHandler(self.device)
@@ -251,10 +248,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
)
prepare_communication_buffer_for_model(self.model)
if self.do_spec_decode:
speculator_model = getattr(self.speculator, "model", None)
if speculator_model is not None:
prepare_communication_buffer_for_model(speculator_model)
if self.speculator is not None:
prepare_communication_buffer_for_model(self.speculator)
def get_model(self) -> nn.Module:
return self.model