[V1] Detokenizer: Respect Stop Tokens + not include_stop_str_in_output (#14624)

commit 02fcaa3d0a
parent 8a4a2efc6f
Author: afeldman-nm
Date:   2025-03-13 15:07:34 -04:00
Committed-by: GitHub
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>

4 changed files with 215 additions and 18 deletions
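At the user level, the fix means that when a request is terminated by a stop token and include_stop_str_in_output is False (the default), the stop token's text no longer leaks into the detokenized output. A minimal sketch of the affected path, using vLLM's public API; the model name and stop token id are illustrative, not taken from this commit:

from vllm import LLM, SamplingParams

# Illustrative model and stop token id only.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(
    max_tokens=64,
    stop_token_ids=[2],                # e.g. the model's EOS/stop token id
    include_stop_str_in_output=False,  # default: stop text excluded
)

outputs = llm.generate(["Hello, my name is"], params)
# With this fix, V1 no longer appends the stop token's text to the
# request's output when generation is stop-token-terminated.
print(outputs[0].outputs[0].text)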

--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py

@@ -88,7 +88,8 @@ class IncrementalDetokenizer:
             stop_buffer_length=stop_buffer_length,
         )
 
-    def update(self, new_token_ids: list[int]) -> Optional[str]:
+    def update(self, new_token_ids: list[int],
+               stop_terminated: bool) -> Optional[str]:
         """
         Update RequestState for the request_id by:
             1) Detokenize the new token ids incrementally.
@@ -96,11 +97,22 @@ class IncrementalDetokenizer:
         Return matched stop string or None.
         """
+        if not new_token_ids:
+            # Skip detokenization if no new token ids
+            return None
+
         if self.tokenizer is None:
             # Skip detokenization if no tokenizer
             self.token_ids.extend(new_token_ids)
             return None
 
+        if stop_terminated and not self.include_stop_str_in_output:
+            # If stop-terminated, exclude last token from detokenization
+            # based on include_stop_str_in_output parameter.
+            skipped_stop_token_id = new_token_ids[-1]
+            new_token_ids = new_token_ids[:-1]
+        else:
+            skipped_stop_token_id = None
+
         # 1) Detokenize the new token ids incrementally.
         # TODO(woosuk): This method becomes very inefficient when the number of
         # new_token_ids is more than 1. We need to optimize this.
@@ -127,7 +139,14 @@ class IncrementalDetokenizer:
 
         self.output_text += decoded_text
 
-        # 2) Evaluate stop criteria.
+        if stop_terminated:
+            if skipped_stop_token_id is not None:
+                # Cleanup after skipping detokenization
+                self.token_ids.append(skipped_stop_token_id)
+            # Stop token triggered; skip stop string check
+            return None
+
+        # 2) Evaluate stop strings.
         stop_string = None
         if self.stop:
             stop = StopChecker.check_stop_strings(
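The detokenizer hunks above implement one pattern: when the request was stop-terminated and stop text is excluded from output, the final token is withheld from incremental detokenization but re-appended to token_ids afterward, so the stored token sequence stays complete. A standalone sketch of that pattern; this is not the vLLM class itself, and decode_fn plus the list-based state are stand-ins:

from typing import Callable, Optional

def update_sketch(token_ids: list[int],
                  new_token_ids: list[int],
                  stop_terminated: bool,
                  include_stop_str_in_output: bool,
                  decode_fn: Callable[[list[int]], str]) -> str:
    """Return the text decoded from new_token_ids, honoring stop exclusion."""
    skipped_stop_token_id: Optional[int] = None
    if stop_terminated and not include_stop_str_in_output:
        # Withhold the terminating stop token from detokenization.
        skipped_stop_token_id = new_token_ids[-1]
        new_token_ids = new_token_ids[:-1]

    # Incrementally "detokenize" the remaining tokens.
    decoded_text = decode_fn(new_token_ids)
    token_ids.extend(new_token_ids)

    if skipped_stop_token_id is not None:
        # Keep the stored token sequence complete even though the stop
        # token contributed no output text.
        token_ids.append(skipped_stop_token_id)
    return decoded_text

# Toy usage: "decode" token ids as bracketed numbers; token id 2 is the stop.
history: list[int] = []
text = update_sketch(history, [5, 7, 2], stop_terminated=True,
                     include_stop_str_in_output=False,
                     decode_fn=lambda ids: "".join(f"[{i}]" for i in ids))
assert text == "[5][7]" and history == [5, 7, 2]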

--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py

@@ -299,9 +299,9 @@ class OutputProcessor:
# in the EngineCore.
req_state.is_prefilling = not new_token_ids
# 2) Detokenize the token ids into text and check for stop
# strings.
stop_string = req_state.detokenizer.update(new_token_ids)
# 2) Detokenize the token ids into text and perform stop checks.
stop_string = req_state.detokenizer.update(
new_token_ids, finish_reason == FinishReason.STOP)
if stop_string and finish_reason != FinishReason.STOP:
finish_reason = FinishReason.STOP
stop_reason = stop_string
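On the caller side, the detokenizer now learns whether the engine core already stop-terminated the request (finish_reason == FinishReason.STOP), while a stop-string match found during detokenization can still override a different finish reason. A sketch of that control flow with stand-in types; the real OutputProcessor carries much more state, and detokenizer here is any object with the update signature shown above:

from enum import Enum
from typing import Optional

class FinishReason(Enum):
    STOP = "stop"
    LENGTH = "length"

def resolve_finish(detokenizer, new_token_ids: list[int],
                   finish_reason: Optional[FinishReason],
                   stop_reason: Optional[str]):
    # Tell the detokenizer whether a stop token already ended the request;
    # if so, it skips the stop-string scan (and may drop the stop token).
    stop_string = detokenizer.update(
        new_token_ids, finish_reason == FinishReason.STOP)
    if stop_string and finish_reason != FinishReason.STOP:
        # A stop string matched in the newly decoded text: report a stop
        # finish even though the engine core gave a different reason.
        finish_reason = FinishReason.STOP
        stop_reason = stop_string
    return finish_reason, stop_reason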