[BugFix][Spec Decode] Fix spec token ids in model runner (#20530)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
@@ -528,18 +528,19 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
start_token_index:end_token_index] = new_token_ids
|
start_token_index:end_token_index] = new_token_ids
|
||||||
self.input_batch.num_tokens_no_spec[
|
self.input_batch.num_tokens_no_spec[
|
||||||
req_index] = end_token_index
|
req_index] = end_token_index
|
||||||
|
self.input_batch.num_tokens[req_index] = end_token_index
|
||||||
|
|
||||||
# Add spec_token_ids to token_ids_cpu.
|
# Add spec_token_ids to token_ids_cpu.
|
||||||
spec_token_ids = (
|
spec_token_ids = (
|
||||||
scheduler_output.scheduled_spec_decode_tokens.get(
|
scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
|
||||||
req_id, ()))
|
|
||||||
if spec_token_ids:
|
if spec_token_ids:
|
||||||
start_index = end_token_index
|
num_spec_tokens = len(spec_token_ids)
|
||||||
end_token_index += len(spec_token_ids)
|
start_index = self.input_batch.num_tokens_no_spec[req_index]
|
||||||
|
end_token_index = start_index + num_spec_tokens
|
||||||
self.input_batch.token_ids_cpu[
|
self.input_batch.token_ids_cpu[
|
||||||
req_index,
|
req_index, start_index:end_token_index] = spec_token_ids
|
||||||
start_index:end_token_index] = spec_token_ids
|
|
||||||
# NOTE(woosuk): `num_tokens` here may include spec tokens.
|
# NOTE(woosuk): `num_tokens` here may include spec tokens.
|
||||||
self.input_batch.num_tokens[req_index] = end_token_index
|
self.input_batch.num_tokens[req_index] += num_spec_tokens
|
||||||
|
|
||||||
# Add the new or resumed requests to the persistent batch.
|
# Add the new or resumed requests to the persistent batch.
|
||||||
# The smaller empty indices are filled first.
|
# The smaller empty indices are filled first.
|
||||||
|
|||||||
Reference in New Issue
Block a user