From f5266646ebbf8152c820a2c7c0ea04fe2acff545 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 14 Apr 2026 08:21:14 +0000 Subject: [PATCH] Make is_reasoning_end() always return False MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vLLM serving layer calls is_reasoning_end() with prompt_token_ids to pre-compute whether reasoning has ended before streaming starts. On multi-turn conversations, prompt_token_ids contains think-end tokens from prior assistant messages in the chat history. This causes a false positive — the serving layer sets reasoning_end_arr[i] = True, skips extract_reasoning_streaming entirely, and routes all thinking text to content. By returning False, the serving layer always calls extract_reasoning_streaming, which correctly tracks reasoning state via _reasoning_ended based only on the model's generated text. --- kimi_k2_reasoning_parser.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/kimi_k2_reasoning_parser.py b/kimi_k2_reasoning_parser.py index 30ca951..b579b63 100644 --- a/kimi_k2_reasoning_parser.py +++ b/kimi_k2_reasoning_parser.py @@ -144,23 +144,29 @@ class KimiK2ReasoningParser(ReasoningParser): # ------------------------------------------------------------------ def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: - """Check if reasoning has ended by scanning the full token sequence. - - Reasoning ends when we see either ```` or a tool-section - start token after the last ````. + """Check if reasoning has ended. + + IMPORTANT: Always returns False for this parser. The reasoning + state is tracked internally by ``_reasoning_ended`` which is + updated only when ``extract_reasoning_streaming`` detects + think-end or a tool-section marker in the model's *generated* + text. + + The vLLM serving layer calls this with ``prompt_token_ids`` to + pre-compute whether reasoning has ended. On multi-turn + conversations, the prompt contains think-end tokens from prior + assistant messages, which would cause a false positive — the + serving layer would skip ``extract_reasoning_streaming`` entirely + and route all thinking text to content. + + Returning False ensures the serving layer always calls + ``extract_reasoning_streaming``, which correctly handles the + transition using generated text only. """ if self._identity_parser is not None: return self._identity_parser.is_reasoning_end(input_ids) - - for i in range(len(input_ids) - 1, -1, -1): - if input_ids[i] == self._start_token_id: - return False - if input_ids[i] == self._end_token_id: - return True - if input_ids[i] in self._tool_section_start_token_ids: - return True + return False - def is_reasoning_end_streaming( self, input_ids: Sequence[int], delta_ids: Iterable[int] ) -> bool: