[V1][Spec Decode] Respect prompt_lookup_max (#15348)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Author: Woosuk Kwon
Date: 2025-03-23 10:41:44 -07:00
Committed by: GitHub
parent 6ebaf9ac71
commit b9bd76ca14
3 changed files with 67 additions and 5 deletions

View File

@@ -160,6 +160,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.drafter.propose(
np.zeros(1024, dtype=np.int32),
self.speculative_config.prompt_lookup_min,
self.speculative_config.prompt_lookup_max,
self.speculative_config.num_speculative_tokens,
)
self.rejection_sampler = RejectionSampler()
@@ -1155,6 +1156,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
drafter_output = self.drafter.propose(
self.input_batch.token_ids_cpu[i, :end_idx],
self.speculative_config.prompt_lookup_min,
self.speculative_config.prompt_lookup_max,
self.speculative_config.num_speculative_tokens,
)
if drafter_output is None or len(drafter_output) == 0: