[V1][Spec Decode] Respect prompt_lookup_max (#15348)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
@@ -160,6 +160,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 self.drafter.propose(
     np.zeros(1024, dtype=np.int32),
     self.speculative_config.prompt_lookup_min,
+    self.speculative_config.prompt_lookup_max,
     self.speculative_config.num_speculative_tokens,
 )
 self.rejection_sampler = RejectionSampler()
@@ -1155,6 +1156,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 drafter_output = self.drafter.propose(
     self.input_batch.token_ids_cpu[i, :end_idx],
     self.speculative_config.prompt_lookup_min,
+    self.speculative_config.prompt_lookup_max,
     self.speculative_config.num_speculative_tokens,
 )
 if drafter_output is None or len(drafter_output) == 0:
Reference in New Issue
Block a user