def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
    """Return True if the current turn's reasoning has ended, else False.

    Walks *input_ids* backward and decides from the most recent
    reasoning-related token alone:

    * think-start is the most recent  -> a think block is open, so
      reasoning is still in progress: return False.
    * think-end or tool-section-start is the most recent -> reasoning
      has ended **only if** that token sits at (or within
      ``_MAX_TRAILING_TOKENS`` of) the end of the sequence. If it is
      followed by more tokens than that, those trailing tokens are
      chat-template wrapping (``<|im_end|>``, user markers, the new
      assistant prompt), i.e. the think-end belongs to a PRIOR
      assistant turn and the current generation has not started
      reasoning yet: return False.
    * no reasoning tokens at all -> return False.

    NOTE(review): the trailing-token threshold is a heuristic. It
    assumes the chat template always inserts more than
    ``_MAX_TRAILING_TOKENS`` tokens between a prior turn's think-end
    and the start of the next generation — confirm against the actual
    Kimi K2 chat template before relying on it.

    Args:
        input_ids: Token IDs — typically the full prompt (chat
            history), possibly followed by tokens generated so far.
            The vLLM serving layer calls this with
            ``prompt_token_ids``.

    Returns:
        True if reasoning for the current turn has ended.
    """
    # Identity parser (if configured) owns the decision outright.
    if self._identity_parser is not None:
        return self._identity_parser.is_reasoning_end(input_ids)

    # A think-end/tool-section token followed by more than this many
    # tokens is treated as a prior turn's marker (prompt wrapping),
    # not the current generation's.
    _MAX_TRAILING_TOKENS = 3

    # The first reasoning-related token met walking backward is the
    # *last* one in the sequence — which is all the decision needs,
    # so we can return as soon as we see one.
    tool_section_ids = self._tool_section_start_token_ids
    for tokens_after, token_id in enumerate(reversed(input_ids)):
        if token_id == self._start_token_id:
            # Most recent marker opens a think block: still reasoning.
            return False
        if token_id == self._end_token_id or token_id in tool_section_ids:
            # Most recent marker closes reasoning (or opens a tool
            # section). It only counts if it is near the very end;
            # otherwise it came from a prior turn (see docstring).
            return tokens_after <= _MAX_TRAILING_TOKENS

    # No reasoning tokens anywhere in the sequence.
    return False