diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index 1d34fc51a..b73a0b074 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -2,7 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem +from openai.types.responses import ( + ResponseFunctionToolCall, + ResponseOutputMessage, + ResponseReasoningItem, +) from openai.types.responses.response_output_item import McpCall from openai_harmony import Author, Message, Role, TextContent @@ -10,6 +14,7 @@ from tests.entrypoints.openai.utils import verify_harmony_messages from vllm.entrypoints.openai.parser.harmony_utils import ( auto_drop_analysis_messages, get_encoding, + get_system_message, has_custom_tools, parse_chat_input_to_harmony_message, parse_chat_output, @@ -840,15 +845,58 @@ class TestParseChatOutput: assert reasoning == "I've thought hard about this." assert final_content == "The answer is 4." + def test_parse_chat_output_commentary_with_recipient_excluded(self) -> None: + """Commentary with a recipient (tool call) should not appear in + final_content — those are handled separately by the tool parser. + + The first message is a preamble (visible), the second is a tool + call (excluded). Only the preamble should appear in final_content. + """ + harmony_str = ( + "<|channel|>commentary" + "<|message|>Let me check the weather.<|end|>" + "<|start|>assistant to=functions.get_weather" + "<|channel|>commentary" + '<|message|>{"location": "SF"}<|end|>' + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "Let me check the weather." + + def test_parse_chat_output_interrupted_preamble(self) -> None: + """Partial/interrupted preamble (commentary without recipient) should + appear in final_content, not reasoning.""" + harmony_str = "<|channel|>commentary<|message|>I'll search for that" + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "I'll search for that" + + def test_parse_chat_output_preamble_then_final(self) -> None: + """Preamble followed by a final message should both appear in + final_content, joined by newline.""" + harmony_str = ( + "<|channel|>commentary" + "<|message|>Let me look that up.<|end|>" + "<|start|>assistant<|channel|>final" + "<|message|>The answer is 42.<|end|>" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "Let me look that up.\nThe answer is 42." + class TestParseOutputMessage: """Tests for parse_output_message function.""" - def test_commentary_with_no_recipient_creates_reasoning(self): - """Test that commentary with recipient=None (preambles) creates reasoning items. + def test_commentary_with_no_recipient_creates_message(self): + """Test that commentary with recipient=None (preambles) creates message items. - Per Harmony format, commentary channel can contain preambles to calling - multiple functions - explanatory text with no recipient. + Per Harmony format, preambles are intended to be shown to end-users, + unlike analysis channel content which is hidden reasoning. + See: https://cookbook.openai.com/articles/openai-harmony """ message = Message.from_role_and_content( Role.ASSISTANT, "I will now search for the weather information." @@ -859,13 +907,16 @@ class TestParseOutputMessage: output_items = parse_output_message(message) assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseReasoningItem) - assert output_items[0].type == "reasoning" + assert isinstance(output_items[0], ResponseOutputMessage) + assert output_items[0].type == "message" + assert output_items[0].role == "assistant" + assert output_items[0].status == "completed" + assert len(output_items[0].content) == 1 + assert output_items[0].content[0].type == "output_text" assert ( output_items[0].content[0].text == "I will now search for the weather information." ) - assert output_items[0].content[0].type == "reasoning_text" def test_commentary_with_function_recipient_creates_function_call(self): """Test commentary with recipient='functions.X' creates function calls.""" @@ -944,7 +995,7 @@ class TestParseOutputMessage: output_items = parse_output_message(message) assert len(output_items) == 1 - assert isinstance(output_items[0], ResponseReasoningItem) + assert isinstance(output_items[0], ResponseOutputMessage) assert output_items[0].content[0].text == "" def test_commentary_with_multiple_contents_and_no_recipient(self): @@ -958,10 +1009,13 @@ class TestParseOutputMessage: output_items = parse_output_message(message) - assert len(output_items) == 2 - assert all(isinstance(item, ResponseReasoningItem) for item in output_items) + # _parse_final_message returns single ResponseOutputMessage with + # multiple contents + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseOutputMessage) + assert len(output_items[0].content) == 2 assert output_items[0].content[0].text == "Step 1: Analyze the request" - assert output_items[1].content[0].text == "Step 2: Prepare to call functions" + assert output_items[0].content[1].text == "Step 2: Prepare to call functions" def test_commentary_with_multiple_function_calls(self): """Test multiple function calls in commentary channel.""" @@ -1133,7 +1187,7 @@ def test_parse_remaining_state_commentary_channel() -> None: assert mcp_items[0].status == "in_progress" # Test 3: Built-in tool (python) - # should NOT return MCP call, falls through to reasoning + # should NOT return MCP call, returns reasoning (internal tool interaction) parser_builtin = Mock() parser_builtin.current_content = "print('hello')" parser_builtin.current_role = Role.ASSISTANT @@ -1142,11 +1196,26 @@ def test_parse_remaining_state_commentary_channel() -> None: builtin_items = parse_remaining_state(parser_builtin) - # Should fall through to reasoning logic + # Built-in tools explicitly return reasoning assert len(builtin_items) == 1 assert not isinstance(builtin_items[0], McpCall) assert builtin_items[0].type == "reasoning" + # Test 4: No recipient (preamble) → should return message, not reasoning + parser_preamble = Mock() + parser_preamble.current_content = "I'll search for that information now." + parser_preamble.current_role = Role.ASSISTANT + parser_preamble.current_channel = "commentary" + parser_preamble.current_recipient = None + + preamble_items = parse_remaining_state(parser_preamble) + + assert len(preamble_items) == 1 + assert isinstance(preamble_items[0], ResponseOutputMessage) + assert preamble_items[0].type == "message" + assert preamble_items[0].content[0].text == "I'll search for that information now." + assert preamble_items[0].status == "incomplete" # streaming + def test_parse_remaining_state_analysis_channel() -> None: """Test parse_remaining_state with analysis channel and various recipients.""" @@ -1199,3 +1268,29 @@ def test_parse_remaining_state_analysis_channel() -> None: assert len(builtin_items) == 1 assert not isinstance(builtin_items[0], McpCall) assert builtin_items[0].type == "reasoning" + + +class TestGetSystemMessage: + """Tests for get_system_message channel configuration.""" + + def test_commentary_channel_present_without_custom_tools(self) -> None: + """Commentary channel must be valid even without custom tools.""" + sys_msg = get_system_message(with_custom_tools=False) + valid_channels = sys_msg.content[0].channel_config.valid_channels + assert "commentary" in valid_channels + + def test_commentary_channel_present_with_custom_tools(self) -> None: + """Commentary channel present when custom tools are enabled.""" + sys_msg = get_system_message(with_custom_tools=True) + valid_channels = sys_msg.content[0].channel_config.valid_channels + assert "commentary" in valid_channels + + def test_all_standard_channels_present(self) -> None: + """All three standard Harmony channels should always be valid.""" + for with_tools in (True, False): + sys_msg = get_system_message(with_custom_tools=with_tools) + valid_channels = sys_msg.content[0].channel_config.valid_channels + for channel in ("analysis", "commentary", "final"): + assert channel in valid_channels, ( + f"{channel} missing when with_custom_tools={with_tools}" + ) diff --git a/tests/entrypoints/openai/responses/test_harmony.py b/tests/entrypoints/openai/responses/test_harmony.py index af7de2026..78419c92a 100644 --- a/tests/entrypoints/openai/responses/test_harmony.py +++ b/tests/entrypoints/openai/responses/test_harmony.py @@ -712,15 +712,14 @@ async def test_function_calling_required(client: OpenAI, model_name: str): async def test_system_message_with_tools(client: OpenAI, model_name: str): from vllm.entrypoints.openai.parser.harmony_utils import get_system_message - # Test with custom tools enabled - commentary channel should be available - sys_msg = get_system_message(with_custom_tools=True) - valid_channels = sys_msg.content[0].channel_config.valid_channels - assert "commentary" in valid_channels - - # Test with custom tools disabled - commentary channel should be removed - sys_msg = get_system_message(with_custom_tools=False) - valid_channels = sys_msg.content[0].channel_config.valid_channels - assert "commentary" not in valid_channels + # Commentary channel should always be present (needed for preambles) + # regardless of whether custom tools are enabled + for with_tools in (True, False): + sys_msg = get_system_message(with_custom_tools=with_tools) + valid_channels = sys_msg.content[0].channel_config.valid_channels + assert "commentary" in valid_channels, ( + f"commentary channel missing when with_custom_tools={with_tools}" + ) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/responses/test_mcp_tools.py b/tests/entrypoints/openai/responses/test_mcp_tools.py index add199b61..310af4308 100644 --- a/tests/entrypoints/openai/responses/test_mcp_tools.py +++ b/tests/entrypoints/openai/responses/test_mcp_tools.py @@ -172,13 +172,13 @@ class TestMCPEnabled: recipient = message.get("recipient") if recipient and recipient.startswith("python"): tool_call_found = True - assert message.get("channel") == "analysis" + assert message.get("channel") == "commentary" author = message.get("author", {}) if author.get("role") == "tool" and (author.get("name") or "").startswith( "python" ): tool_response_found = True - assert message.get("channel") == "analysis" + assert message.get("channel") == "commentary" assert tool_call_found, ( f"No Python tool call found. " diff --git a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py index 21d3d02ce..9f8c36f04 100644 --- a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py +++ b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py @@ -180,20 +180,13 @@ class TestExtractHarmonyStreamingDelta: assert delta_message.tool_calls[0].index == 1 - @pytest.mark.parametrize( - "channel,recipient", - [ - ("commentary", None), - ("commentary", "browser.search"), - ], - ) - def test_returns_tool_call_preambles(self, channel, recipient): - """Test that invalid tool recipient on commentary is treated as content.""" + def test_returns_preambles_as_content(self): + """Test that commentary with no recipient (preamble) is user content.""" parser = MockStreamableParser() delta_text = "some text" token_states = [ - TokenState(channel=channel, recipient=recipient, text=delta_text) + TokenState(channel="commentary", recipient=None, text=delta_text) ] delta_message, tools_streamed = extract_harmony_streaming_delta( @@ -211,6 +204,7 @@ class TestExtractHarmonyStreamingDelta: [ (None, None), ("unknown_channel", None), + ("commentary", "browser.search"), ], ) def test_returns_none_for_invalid_inputs(self, channel, recipient): diff --git a/tests/entrypoints/openai/test_serving_responses.py b/tests/entrypoints/openai/test_serving_responses.py index 5cf07ac0f..291bfd442 100644 --- a/tests/entrypoints/openai/test_serving_responses.py +++ b/tests/entrypoints/openai/test_serving_responses.py @@ -26,6 +26,9 @@ from vllm.entrypoints.openai.responses.serving import ( _extract_allowed_tools_from_mcp_requests, extract_tool_types, ) +from vllm.entrypoints.openai.responses.streaming_events import ( + StreamingState, +) from vllm.inputs.data import TokensPrompt from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams @@ -439,3 +442,115 @@ class TestExtractAllowedToolsFromMcpRequests: "server1": ["tool1"], "server2": ["tool2"], } + + +class TestHarmonyPreambleStreaming: + """Tests for preamble (commentary with no recipient) streaming events.""" + + @staticmethod + def _make_ctx(*, channel, recipient, delta="hello"): + """Build a lightweight mock StreamingHarmonyContext.""" + ctx = MagicMock() + ctx.last_content_delta = delta + ctx.parser.current_channel = channel + ctx.parser.current_recipient = recipient + return ctx + + @staticmethod + def _make_previous_item(*, channel, recipient, text="preamble text"): + """Build a lightweight mock previous_item (openai_harmony Message).""" + content_part = MagicMock() + content_part.text = text + item = MagicMock() + item.channel = channel + item.recipient = recipient + item.content = [content_part] + return item + + def test_preamble_delta_emits_text_events(self) -> None: + """commentary + recipient=None should emit output_text.delta events.""" + from vllm.entrypoints.openai.responses.streaming_events import ( + emit_content_delta_events, + ) + + ctx = self._make_ctx(channel="commentary", recipient=None) + state = StreamingState() + + events = emit_content_delta_events(ctx, state) + + type_names = [e.type for e in events] + assert "response.output_text.delta" in type_names + assert "response.output_item.added" in type_names + + def test_preamble_delta_second_token_no_added(self) -> None: + """Second preamble token should emit delta only, not added again.""" + from vllm.entrypoints.openai.responses.streaming_events import ( + emit_content_delta_events, + ) + + ctx = self._make_ctx(channel="commentary", recipient=None, delta="w") + state = StreamingState() + state.sent_output_item_added = True + state.current_item_id = "msg_test" + state.current_content_index = 0 + + events = emit_content_delta_events(ctx, state) + + type_names = [e.type for e in events] + assert "response.output_text.delta" in type_names + assert "response.output_item.added" not in type_names + + def test_commentary_with_function_recipient_not_preamble(self) -> None: + """commentary + recipient='functions.X' must NOT use preamble path.""" + from vllm.entrypoints.openai.responses.streaming_events import ( + emit_content_delta_events, + ) + + ctx = self._make_ctx( + channel="commentary", + recipient="functions.get_weather", + ) + state = StreamingState() + + events = emit_content_delta_events(ctx, state) + + type_names = [e.type for e in events] + assert "response.output_text.delta" not in type_names + + def test_preamble_done_emits_text_done_events(self) -> None: + """Completed preamble should emit text done + content_part done + + output_item done, same shape as final channel.""" + from vllm.entrypoints.openai.responses.streaming_events import ( + emit_previous_item_done_events, + ) + + previous = self._make_previous_item(channel="commentary", recipient=None) + state = StreamingState() + state.current_item_id = "msg_test" + state.current_output_index = 0 + state.current_content_index = 0 + + events = emit_previous_item_done_events(previous, state) + + type_names = [e.type for e in events] + assert "response.output_text.done" in type_names + assert "response.content_part.done" in type_names + assert "response.output_item.done" in type_names + + def test_commentary_with_recipient_no_preamble_done(self) -> None: + """commentary + recipient='functions.X' should route to function call + done, not preamble done.""" + from vllm.entrypoints.openai.responses.streaming_events import ( + emit_previous_item_done_events, + ) + + previous = self._make_previous_item( + channel="commentary", recipient="functions.get_weather" + ) + state = StreamingState() + state.current_item_id = "fc_test" + + events = emit_previous_item_done_events(previous, state) + + type_names = [e.type for e in events] + assert "response.output_text.done" not in type_names diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py index 1ab2b5edb..b1c8df4fa 100644 --- a/tests/entrypoints/test_context.py +++ b/tests/entrypoints/test_context.py @@ -236,6 +236,44 @@ def test_reasoning_tokens_counting(mock_parser): assert context.num_output_tokens == 4 +def test_preamble_tokens_not_counted_as_reasoning(mock_parser): + """Preambles (commentary with no recipient) are visible user text, + not hidden reasoning. They must NOT inflate num_reasoning_tokens.""" + context = HarmonyContext(messages=[], available_tools=[]) + + mock_parser.current_channel = "commentary" + mock_parser.current_recipient = None # preamble + + mock_output = create_mock_request_output( + prompt_token_ids=[1, 2, 3], + output_token_ids=[4, 5, 6], + num_cached_tokens=0, + ) + context.append_output(mock_output) + + assert context.num_reasoning_tokens == 0 + assert context.num_output_tokens == 3 + + +def test_commentary_with_recipient_counted_as_reasoning(mock_parser): + """Commentary directed at a tool (recipient != None) is hidden from + the user, so it should still count as reasoning tokens.""" + context = HarmonyContext(messages=[], available_tools=[]) + + mock_parser.current_channel = "commentary" + mock_parser.current_recipient = "python" + + mock_output = create_mock_request_output( + prompt_token_ids=[1, 2, 3], + output_token_ids=[4, 5, 6], + num_cached_tokens=0, + ) + context.append_output(mock_output) + + assert context.num_reasoning_tokens == 3 + assert context.num_output_tokens == 3 + + def test_zero_tokens_edge_case(): """Test behavior with all zero token counts.""" context = HarmonyContext(messages=[], available_tools=[]) diff --git a/vllm/entrypoints/openai/chat_completion/stream_harmony.py b/vllm/entrypoints/openai/chat_completion/stream_harmony.py index 4dbdddd20..87f2f9b92 100644 --- a/vllm/entrypoints/openai/chat_completion/stream_harmony.py +++ b/vllm/entrypoints/openai/chat_completion/stream_harmony.py @@ -147,7 +147,7 @@ def extract_harmony_streaming_delta( function=DeltaFunctionCall(arguments=group.text), ) ) - elif group.channel == "commentary": + elif group.channel == "commentary" and group.recipient is None: # Tool call preambles meant to be shown to the user combined_content += group.text content_encountered = True diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 486873db8..9dfd5f518 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -26,7 +26,6 @@ from openai.types.responses.response_reasoning_item import ( from openai.types.responses.tool import Tool from openai_harmony import ( Author, - ChannelConfig, Conversation, DeveloperContent, HarmonyEncodingName, @@ -126,13 +125,6 @@ def get_system_message( sys_msg_content = sys_msg_content.with_tools(python_description) if container_description is not None: sys_msg_content = sys_msg_content.with_tools(container_description) - if not with_custom_tools: - channel_config = sys_msg_content.channel_config - invalid_channel = "commentary" - new_config = ChannelConfig.require_channels( - [c for c in channel_config.valid_channels if c != invalid_channel] - ) - sys_msg_content = sys_msg_content.with_channel_config(new_config) sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content) return sys_msg @@ -686,6 +678,22 @@ def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem return output_items +def _parse_message_no_recipient( + message: Message, +) -> list[ResponseOutputItem]: + """Parse a Harmony message with no recipient based on its channel.""" + if message.channel == "analysis": + return _parse_reasoning(message) + + if message.channel in ("commentary", "final"): + # Per Harmony format, preambles (commentary with no recipient) and + # final channel content are both intended to be shown to end-users. + # See: https://cookbook.openai.com/articles/openai-harmony + return [_parse_final_message(message)] + + raise ValueError(f"Unknown channel: {message.channel}") + + def parse_output_message(message: Message) -> list[ResponseOutputItem]: """ Parse a Harmony message into a list of output response items. @@ -717,19 +725,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: output_items.extend(_parse_mcp_call(message, recipient)) # No recipient - handle based on channel for non-tool messages - elif message.channel == "analysis": - output_items.extend(_parse_reasoning(message)) - - elif message.channel == "commentary": - # Per Harmony format, commentary channel can contain preambles to calling - # multiple functions - explanatory text with no recipient - output_items.extend(_parse_reasoning(message)) - - elif message.channel == "final": - output_items.append(_parse_final_message(message)) - else: - raise ValueError(f"Unknown channel: {message.channel}") + output_items.extend(_parse_message_no_recipient(message)) return output_items @@ -786,7 +783,26 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]: ) ] - if parser.current_channel in ("commentary", "analysis"): + if parser.current_channel == "commentary": + # Per Harmony format, preambles (commentary with no recipient) are + # intended to be shown to end-users, unlike analysis channel content. + output_text = ResponseOutputText( + text=parser.current_content, + annotations=[], + type="output_text", + logprobs=None, + ) + return [ + ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=[output_text], + role="assistant", + status="incomplete", + type="message", + ) + ] + + if parser.current_channel == "analysis": return [ ResponseReasoningItem( id=f"rs_{random_uuid()}", @@ -855,17 +871,30 @@ def parse_chat_output( is_tool_call = False # TODO: update this when tool call is supported # Get completed messages from the parser + # - analysis channel: hidden reasoning + # - commentary channel without recipient (preambles): visible to user + # - final channel: visible to user + # - commentary with recipient (tool calls): handled separately by tool parser reasoning_texts = [ msg.content[0].text for msg in output_msgs if msg.channel == "analysis" ] final_texts = [ - msg.content[0].text for msg in output_msgs if msg.channel != "analysis" + msg.content[0].text + for msg in output_msgs + if msg.channel == "final" or (msg.channel == "commentary" and not msg.recipient) ] # Extract partial messages from the parser if parser.current_channel == "analysis" and parser.current_content: reasoning_texts.append(parser.current_content) - elif parser.current_channel != "analysis" and parser.current_content: + elif parser.current_channel == "final" and parser.current_content: + final_texts.append(parser.current_content) + elif ( + parser.current_channel == "commentary" + and not parser.current_recipient + and parser.current_content + ): + # Preambles (commentary without recipient) are visible to user final_texts.append(parser.current_content) # Flatten multiple messages into a single string diff --git a/vllm/entrypoints/openai/responses/context.py b/vllm/entrypoints/openai/responses/context.py index b57adeeb8..bab59e0aa 100644 --- a/vllm/entrypoints/openai/responses/context.py +++ b/vllm/entrypoints/openai/responses/context.py @@ -540,8 +540,12 @@ class HarmonyContext(ConversationContext): self.first_tok_of_message = True # For streaming support def _update_num_reasoning_tokens(self): - # Count all analysis and commentary channels as reasoning tokens - if self.parser.current_channel in {"analysis", "commentary"}: + channel = self.parser.current_channel + if channel == "analysis": + self.num_reasoning_tokens += 1 + elif channel == "commentary" and self.parser.current_recipient is not None: + # Tool interactions (python/browser/container) are hidden. + # Preambles (recipient=None) are visible user text. self.num_reasoning_tokens += 1 def append_output(self, output: RequestOutput) -> None: diff --git a/vllm/entrypoints/openai/responses/streaming_events.py b/vllm/entrypoints/openai/responses/streaming_events.py index 49d2b99da..cc242e7ba 100644 --- a/vllm/entrypoints/openai/responses/streaming_events.py +++ b/vllm/entrypoints/openai/responses/streaming_events.py @@ -563,7 +563,9 @@ def emit_content_delta_events( channel = ctx.parser.current_channel recipient = ctx.parser.current_recipient - if channel == "final" and recipient is None: + if channel in ("final", "commentary") and recipient is None: + # Preambles (commentary with no recipient) and final messages + # are both user-visible text. return emit_text_delta_events(delta, state) elif channel == "analysis" and recipient is None: return emit_reasoning_delta_events(delta, state) @@ -607,7 +609,9 @@ def emit_previous_item_done_events( return emit_mcp_completion_events(previous_item.recipient, text, state) elif previous_item.channel == "analysis": return emit_reasoning_done_events(text, state) - elif previous_item.channel == "final": + elif previous_item.channel in ("commentary", "final"): + # Preambles (commentary with no recipient) and final messages + # are both user-visible text. return emit_text_output_done_events(text, state) return []