[BugFix][Spec Decode] Fix spec token ids in model runner (#20530)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-06 12:44:52 -07:00
parent 9fb52e523a
commit 9528e3a05e
1 changed file with 12 additions and 11 deletions
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -528,18 +528,19 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                    start_token_index:end_token_index] = new_token_ids
                self.input_batch.num_tokens_no_spec[
                    req_index] = end_token_index
                self.input_batch.num_tokens[req_index] = end_token_index
            # Add spec_token_ids to token_ids_cpu.
            spec_token_ids = (
-                    scheduler_output.scheduled_spec_decode_tokens.get(
+                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
                        req_id, ()))
            if spec_token_ids:
-                    start_index = end_token_index
+                num_spec_tokens = len(spec_token_ids)
-                    end_token_index += len(spec_token_ids)
+                start_index = self.input_batch.num_tokens_no_spec[req_index]
                end_token_index = start_index + num_spec_tokens
                self.input_batch.token_ids_cpu[
-                        req_index,
+                    req_index, start_index:end_token_index] = spec_token_ids
                        start_index:end_token_index] = spec_token_ids
                # NOTE(woosuk): `num_tokens` here may include spec tokens.
-                self.input_batch.num_tokens[req_index] = end_token_index
+                self.input_batch.num_tokens[req_index] += num_spec_tokens
        # Add the new or resumed requests to the persistent batch.
        # The smaller empty indices are filled first.