[Model Runner V2][Minor] Remove redundant do_spec_decode field (#35039)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk@inferact.ai>
This commit is contained in:
@@ -153,9 +153,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.cp_interleave = self.parallel_config.cp_kv_cache_interleave_size
|
||||
|
||||
self.speculator = None
|
||||
self.num_speculative_steps = 0
|
||||
self.use_aux_hidden_state_outputs = False
|
||||
if self.speculative_config is not None:
|
||||
self.do_spec_decode = True
|
||||
self.num_speculative_steps = self.speculative_config.num_speculative_tokens
|
||||
if self.is_last_pp_rank:
|
||||
self.speculator = init_speculator(self.vllm_config, self.device)
|
||||
@@ -165,9 +165,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.use_aux_hidden_state_outputs = True
|
||||
if self.pp_size > 1:
|
||||
raise ValueError("EAGLE3 with pipeline parallel is not supported.")
|
||||
else:
|
||||
self.do_spec_decode = False
|
||||
self.num_speculative_steps = 0
|
||||
|
||||
# Draft tokens propagation - for spec-dec + struct outputs.
|
||||
self.draft_tokens_handler = DraftTokensHandler(self.device)
|
||||
@@ -251,10 +248,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
)
|
||||
|
||||
prepare_communication_buffer_for_model(self.model)
|
||||
if self.do_spec_decode:
|
||||
speculator_model = getattr(self.speculator, "model", None)
|
||||
if speculator_model is not None:
|
||||
prepare_communication_buffer_for_model(speculator_model)
|
||||
if self.speculator is not None:
|
||||
prepare_communication_buffer_for_model(self.speculator)
|
||||
|
||||
def get_model(self) -> nn.Module:
|
||||
return self.model
|
||||
|
||||
Reference in New Issue
Block a user