diff --git a/tests/tool_parsers/test_gemma4_tool_parser.py b/tests/tool_parsers/test_gemma4_tool_parser.py index 80cf70d6c..26722e68d 100644 --- a/tests/tool_parsers/test_gemma4_tool_parser.py +++ b/tests/tool_parsers/test_gemma4_tool_parser.py @@ -502,3 +502,32 @@ class TestStreamingExtraction: results = self._simulate_streaming(parser, mock_request, chunks) name = self._collect_function_name(results) assert name == "get_status" + + def test_streaming_split_delimiter_no_invalid_json(self, parser, mock_request): + """Partial <|"|> delimiter chars must not leak into streamed JSON. + + Reproduces the bug from https://github.com/vllm-project/vllm/issues/38946 + where a token boundary splits the string delimiter, leaving fragments + like '<|' at the end of a parsed value, which then corrupt the JSON. + """ + chunks = [ + "<|tool_call>", + "call:todowrite{", + 'content:<|"|>Buy milk<|', + '"|>}', + "", + ] + + results = self._simulate_streaming(parser, mock_request, chunks) + + args_text = self._collect_arguments(results) + assert args_text, "No arguments were streamed" + + # Must be valid JSON — the original bug caused a JSON parse error + parsed_args = json.loads(args_text) + assert parsed_args["content"] == "Buy milk" + + # Ensure no raw delimiter fragments leaked into the JSON + assert "<|" not in args_text, ( + f"Partial delimiter leaked into JSON: {args_text!r}" + ) diff --git a/vllm/tool_parsers/gemma4_tool_parser.py b/vllm/tool_parsers/gemma4_tool_parser.py index 3d0e4e7c4..406ba9e70 100644 --- a/vllm/tool_parsers/gemma4_tool_parser.py +++ b/vllm/tool_parsers/gemma4_tool_parser.py @@ -675,10 +675,11 @@ class Gemma4ToolParser(ToolParser): current_args_json = json.dumps(current_args, ensure_ascii=False) # Withhold trailing closing characters that may shift as more - # tokens arrive. Strip trailing '}', '"', and ']' sequences - # to get the "safe prefix". + # tokens arrive. 
Strip trailing '}', '"', ']' and partial + # STRING_DELIM fragments ('<', '|', '>', and the '\\' escaping + # the delimiter's '"' in JSON) to get the "safe prefix". safe_json = current_args_json - while safe_json and safe_json[-1] in ("}", '"', "]"): + while safe_json and safe_json[-1] in ("}", '"', "]", "<", "|", "\\", ">"): safe_json = safe_json[:-1] prev_streamed = self.streamed_args_for_tool[self.current_tool_id]