From f5266646ebbf8152c820a2c7c0ea04fe2acff545 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 14 Apr 2026 08:21:14 +0000
Subject: [PATCH] Make is_reasoning_end() always return False
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vLLM serving layer calls is_reasoning_end() with prompt_token_ids
to pre-compute whether reasoning has ended before streaming starts. On
multi-turn conversations, prompt_token_ids contains think-end tokens
from prior assistant messages in the chat history. This causes a false
positive — the serving layer sets reasoning_end_arr[i] = True, skips
extract_reasoning_streaming entirely, and routes all thinking text to
content.

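Because is_reasoning_end() now returns False (unless an identity parser
is set, in which case the call is still delegated to it), the serving
layer always calls extract_reasoning_streaming, which correctly tracks
reasoning state via _reasoning_ended based only on the model's
generated text.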
---
 kimi_k2_reasoning_parser.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)
diff --git a/kimi_k2_reasoning_parser.py b/kimi_k2_reasoning_parser.py
index 30ca951..b579b63 100644
--- a/kimi_k2_reasoning_parser.py
+++ b/kimi_k2_reasoning_parser.py
@@ -144,23 +144,29 @@ class KimiK2ReasoningParser(ReasoningParser):
     # ------------------------------------------------------------------
 
     def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
-        """Check if reasoning has ended by scanning the full token sequence.
-
-        Reasoning ends when we see either ``</think>`` or a tool-section
-        start token after the last ``<think>``.
+        """Check if reasoning has ended.
+        
+        IMPORTANT: Returns False unless ``_identity_parser`` is set, in
+        which case the call is delegated to it.  The reasoning state is
+        tracked internally by ``_reasoning_ended`` which is updated only
+        when ``extract_reasoning_streaming`` detects think-end or a
+        tool-section marker in the model's *generated* text.
+        
+        The vLLM serving layer calls this with ``prompt_token_ids`` to
+        pre-compute whether reasoning has ended.  On multi-turn
+        conversations, the prompt contains think-end tokens from prior
+        assistant messages, which would cause a false positive — the
+        serving layer would skip ``extract_reasoning_streaming`` entirely
+        and route all thinking text to content.
+        
+        Returning False ensures the serving layer always calls
+        ``extract_reasoning_streaming``, which correctly handles the
+        transition using generated text only.
         """
         if self._identity_parser is not None:
             return self._identity_parser.is_reasoning_end(input_ids)
-
-        for i in range(len(input_ids) - 1, -1, -1):
-            if input_ids[i] == self._start_token_id:
-                return False
-            if input_ids[i] == self._end_token_id:
-                return True
-            if input_ids[i] in self._tool_section_start_token_ids:
-                return True
+        
         return False
-
     def is_reasoning_end_streaming(
         self, input_ids: Sequence[int], delta_ids: Iterable[int]
     ) -> bool: