[V1] Detokenizer: Respect Stop Tokens + not include_stop_str_in_output (#14624)

commit 02fcaa3d0a
parent 8a4a2efc6f
Author: afeldman-nm
Date:   2025-03-13 15:07:34 -04:00
Committed-by: GitHub
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>

4 changed files with 215 additions and 18 deletions
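At the user level, the fix means that when a request is terminated by a stop token and include_stop_str_in_output is False (the default), the stop token's text no longer leaks into the detokenized output. A minimal sketch of the affected path, using vLLM's public API; the model name and stop token id are illustrative, not taken from this commit:

from vllm import LLM, SamplingParams

# Illustrative model and stop token id only.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(
    max_tokens=64,
    stop_token_ids=[2],                # e.g. the model's EOS/stop token id
    include_stop_str_in_output=False,  # default: stop text excluded
)

outputs = llm.generate(["Hello, my name is"], params)
# With this fix, V1 no longer appends the stop token's text to the
# request's output when generation is stop-token-terminated.
print(outputs[0].outputs[0].text)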

--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py

@@ -88,7 +88,8 @@ class IncrementalDetokenizer:
             stop_buffer_length=stop_buffer_length,
         )
 
-    def update(self, new_token_ids: list[int]) -> Optional[str]:
+    def update(self, new_token_ids: list[int],
+               stop_terminated: bool) -> Optional[str]:
         """
         Update RequestState for the request_id by:
             1) Detokenize the new token ids incrementally.
@@ -96,11 +97,22 @@ class IncrementalDetokenizer:
         Return matched stop string or None.
         """
+        if not new_token_ids:
+            # Skip detokenization if no new token ids
+            return None
+
         if self.tokenizer is None:
             # Skip detokenization if no tokenizer
             self.token_ids.extend(new_token_ids)
             return None
 
+        if stop_terminated and not self.include_stop_str_in_output:
+            # If stop-terminated, exclude last token from detokenization
+            # based on include_stop_str_in_output parameter.
+            skipped_stop_token_id = new_token_ids[-1]
+            new_token_ids = new_token_ids[:-1]
+        else:
+            skipped_stop_token_id = None
+
         # 1) Detokenize the new token ids incrementally.
         # TODO(woosuk): This method becomes very inefficient when the number of
         # new_token_ids is more than 1. We need to optimize this.
@@ -127,7 +139,14 @@ class IncrementalDetokenizer:
 
         self.output_text += decoded_text
 
-        # 2) Evaluate stop criteria.
+        if stop_terminated:
+            if skipped_stop_token_id is not None:
+                # Cleanup after skipping detokenization
+                self.token_ids.append(skipped_stop_token_id)
+            # Stop token triggered; skip stop string check
+            return None
+
+        # 2) Evaluate stop strings.
         stop_string = None
         if self.stop:
             stop = StopChecker.check_stop_strings(
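The detokenizer hunks above implement one pattern: when the request was stop-terminated and stop text is excluded from output, the final token is withheld from incremental detokenization but re-appended to token_ids afterward, so the stored token sequence stays complete. A standalone sketch of that pattern; this is not the vLLM class itself, and decode_fn plus the list-based state are stand-ins:

from typing import Callable, Optional

def update_sketch(token_ids: list[int],
                  new_token_ids: list[int],
                  stop_terminated: bool,
                  include_stop_str_in_output: bool,
                  decode_fn: Callable[[list[int]], str]) -> str:
    """Return the text decoded from new_token_ids, honoring stop exclusion."""
    skipped_stop_token_id: Optional[int] = None
    if stop_terminated and not include_stop_str_in_output:
        # Withhold the terminating stop token from detokenization.
        skipped_stop_token_id = new_token_ids[-1]
        new_token_ids = new_token_ids[:-1]

    # Incrementally "detokenize" the remaining tokens.
    decoded_text = decode_fn(new_token_ids)
    token_ids.extend(new_token_ids)

    if skipped_stop_token_id is not None:
        # Keep the stored token sequence complete even though the stop
        # token contributed no output text.
        token_ids.append(skipped_stop_token_id)
    return decoded_text

# Toy usage: "decode" token ids as bracketed numbers; token id 2 is the stop.
history: list[int] = []
text = update_sketch(history, [5, 7, 2], stop_terminated=True,
                     include_stop_str_in_output=False,
                     decode_fn=lambda ids: "".join(f"[{i}]" for i in ids))
assert text == "[5][7]" and history == [5, 7, 2]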

--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py

@@ -299,9 +299,9 @@ class OutputProcessor:
# in the EngineCore.
req_state.is_prefilling = not new_token_ids
# 2) Detokenize the token ids into text and check for stop
# strings.
stop_string = req_state.detokenizer.update(new_token_ids)
# 2) Detokenize the token ids into text and perform stop checks.
stop_string = req_state.detokenizer.update(
new_token_ids, finish_reason == FinishReason.STOP)
if stop_string and finish_reason != FinishReason.STOP:
finish_reason = FinishReason.STOP
stop_reason = stop_string
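On the caller side, the detokenizer now learns whether the engine core already stop-terminated the request (finish_reason == FinishReason.STOP), while a stop-string match found during detokenization can still override a different finish reason. A sketch of that control flow with stand-in types; the real OutputProcessor carries much more state, and detokenizer here is any object with the update signature shown above:

from enum import Enum
from typing import Optional

class FinishReason(Enum):
    STOP = "stop"
    LENGTH = "length"

def resolve_finish(detokenizer, new_token_ids: list[int],
                   finish_reason: Optional[FinishReason],
                   stop_reason: Optional[str]):
    # Tell the detokenizer whether a stop token already ended the request;
    # if so, it skips the stop-string scan (and may drop the stop token).
    stop_string = detokenizer.update(
        new_token_ids, finish_reason == FinishReason.STOP)
    if stop_string and finish_reason != FinishReason.STOP:
        # A stop string matched in the newly decoded text: report a stop
        # finish even though the engine core gave a different reason.
        finish_reason = FinishReason.STOP
        stop_reason = stop_string
    return finish_reason, stop_reason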