From e1060a71a1bb96103ce9ca98345184dcdc982467 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 10 Feb 2026 02:54:41 -0500 Subject: [PATCH] [Perf] Optimize detokenizer python logic (#32975) Signed-off-by: yewentao256 Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Nick Hill --- vllm/v1/engine/detokenizer.py | 12 ++++++++---- vllm/v1/engine/output_processor.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index e77a316b2..18e4c98f8 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -35,6 +35,9 @@ class IncrementalDetokenizer: def output_token_ids(self) -> list[int]: return self.token_ids + def num_output_tokens(self) -> int: + return len(self.token_ids) + def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None: self.token_ids.extend(new_token_ids) return None @@ -112,14 +115,12 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): skipped_stop_token_id = None # 1) Detokenize the new token ids incrementally. - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. stop_check_offset = len(self.output_text) for new_token_id in new_token_ids: self.token_ids.append(new_token_id) self.output_text += self.decode_next(new_token_id) # Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014 - if self.min_tokens and len(self.output_token_ids) <= self.min_tokens: + if self.min_tokens and self.num_output_tokens() <= self.min_tokens: stop_check_offset = len(self.output_text) if skipped_stop_token_id is not None: @@ -128,7 +129,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # 2) Evaluate stop strings. stop_string = None - if self.stop and len(self.output_token_ids) > self.min_tokens: + if self.stop and self.num_output_tokens() > self.min_tokens: stop = check_stop_strings( output_text=self.output_text, new_char_count=len(self.output_text) - stop_check_offset, @@ -295,6 +296,9 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer): else (self.token_ids[self.prompt_len :]) ) + def num_output_tokens(self) -> int: + return len(self.token_ids) - self.prompt_len + def decode_next(self, next_token_id: int) -> str: new_tokens, decoded_text, prefix_offset, read_offset = detokenize_incrementally( tokenizer=self.tokenizer, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 00a5355e0..58c73fbc6 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -292,7 +292,7 @@ class RequestState: if not ( finished or self.sent_tokens_offset == 0 - or len(self.detokenizer.output_token_ids) - self.sent_tokens_offset + or self.detokenizer.num_output_tokens() - self.sent_tokens_offset >= self.stream_interval ): return None @@ -303,7 +303,7 @@ class RequestState: new_token_ids = self.detokenizer.output_token_ids[ self.sent_tokens_offset : ] - self.sent_tokens_offset = len(self.detokenizer.output_token_ids) + self.sent_tokens_offset = self.detokenizer.num_output_tokens() external_req_id = self.external_req_id