[CI][Bugfix]: return McpCall for built-in MCP tools in non-streaming mode (#32762)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-04 21:14:06 -06:00
parent 72bb24e2db
commit fb1270f1f8
6 changed files with 131 additions and 28 deletions
--- a/tests/entrypoints/openai/responses/test_harmony.py
+++ b/tests/entrypoints/openai/responses/test_harmony.py
@@ -62,7 +62,7 @@ async def client(server):
 async def test_basic(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
-        input="What is 13 * 24?",
+        input="What is 123 * 456?",
    )
    assert response is not None
    print("response: ", response)
@@ -74,7 +74,7 @@ async def test_basic(client: OpenAI, model_name: str):
 async def test_basic_with_instructions(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
-        input="What is 13 * 24?",
+        input="What is 123 * 456?",
        instructions="Respond in Korean.",
    )
    assert response is not None
@@ -116,7 +116,7 @@ async def test_chat(client: OpenAI, model_name: str):
            {"role": "system", "content": "Respond in Korean."},
            {"role": "user", "content": "Hello!"},
            {"role": "assistant", "content": "Hello! How can I help you today?"},
-            {"role": "user", "content": "What is 13 * 24? Explain your answer."},
+            {"role": "user", "content": "What is 123 * 456? Explain your answer."},
        ],
    )
    assert response is not None
@@ -131,7 +131,7 @@ async def test_chat_with_input_type(client: OpenAI, model_name: str):
        input=[
            {
                "role": "user",
-                "content": [{"type": "input_text", "text": "What is 13*24?"}],
+                "content": [{"type": "input_text", "text": "What is 123 * 456?"}],
            },
        ],
    )
@@ -200,7 +200,7 @@ async def test_store(client: OpenAI, model_name: str):
    for store in [True, False]:
        response = await client.responses.create(
            model=model_name,
-            input="What is 13 * 24?",
+            input="What is 123 * 456?",
            store=store,
        )
        assert response is not None
@@ -219,7 +219,7 @@ async def test_store(client: OpenAI, model_name: str):
 async def test_background(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
-        input="What is 13 * 24?",
+        input="What is 123 * 456?",
        background=True,
    )
    assert response is not None
@@ -256,7 +256,7 @@ async def test_background_cancel(client: OpenAI, model_name: str):
 async def test_stateful_multi_turn(client: OpenAI, model_name: str):
    response1 = await client.responses.create(
        model=model_name,
-        input="What is 13 * 24?",
+        input="What is 123 * 456?",
    )
    assert response1 is not None
    assert response1.status == "completed"
@@ -361,7 +361,7 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
    # TODO: Add back when web search and code interpreter are available in CI
    prompts = [
        "tell me a story about a cat in 20 words",
-        "What is 13 * 24? Use python to calculate the result.",
+        "What is 123 * 456? Use python to calculate the result.",
        # "When did Jensen found NVIDIA? Search it and answer the year only.",
    ]

@@ -976,6 +976,9 @@ async def test_mcp_code_interpreter_streaming(client: OpenAI, model_name: str, s

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.dependency(
+    depends=["test_mcp_code_interpreter_streaming[openai/gpt-oss-20b]"]
+)
 async def test_mcp_tool_multi_turn(client: OpenAI, model_name: str, server):
    """Test MCP tool calling across multiple turns.

@@ -1117,8 +1120,10 @@ async def test_function_call_with_previous_input_messages(
        model=model_name,
        input="What is the horoscope for Aquarius today?",
        tools=tools,
+        temperature=0.0,
        extra_body={"enable_response_messages": True},
        stream=True,
+        max_output_tokens=1000,
    )

    response = None
@@ -1170,6 +1175,7 @@ async def test_function_call_with_previous_input_messages(
    stream_response_2 = await client.responses.create(
        model=model_name,
        tools=tools,
+        temperature=0.0,
        input="",
        extra_body={
            "previous_input_messages": previous_messages,
--- a/tests/entrypoints/openai/responses/test_mcp_tools.py
+++ b/tests/entrypoints/openai/responses/test_mcp_tools.py
@@ -160,6 +160,7 @@ class TestMCPEnabled:
                "No developer messages should be present with valid mcp tool"
            )

+    @pytest.mark.flaky(reruns=3)
    @pytest.mark.asyncio
    @pytest.mark.parametrize("model_name", [MODEL_NAME])
    async def test_mcp_tool_with_allowed_tools_star(
--- a/tests/entrypoints/openai/responses/test_parsable_context.py
+++ b/tests/entrypoints/openai/responses/test_parsable_context.py
@@ -53,7 +53,7 @@ async def client(server):
 async def test_basic(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
-        input="What is 13 * 24?",
+        input="What is 123 * 456?",
    )
    assert response is not None
    print("response: ", response)
@@ -164,7 +164,7 @@ async def test_function_call_first_turn(client: OpenAI, model_name: str):
 async def test_mcp_tool_call(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
-        input="What is 13 * 24? Use python to calculate the result.",
+        input="What is 123 * 456? Use python to calculate the result.",
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        extra_body={"enable_response_messages": True},
        temperature=0.0,
@@ -179,12 +179,12 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
    assert response.output[2].type == "reasoning"
    # make sure the correct math is in the final output
    assert response.output[3].type == "message"
-    assert "312" in response.output[3].content[0].text
+    assert "56088" in response.output[3].content[0].text

    # test raw input_messages / output_messages
    assert len(response.input_messages) == 1
    assert len(response.output_messages) == 3
-    assert "312" in response.output_messages[2]["message"]
+    assert "56088" in response.output_messages[2]["message"]


@pytest.mark.asyncio
--- a/tests/entrypoints/openai/responses/test_simple.py
+++ b/tests/entrypoints/openai/responses/test_simple.py
@@ -34,7 +34,7 @@ async def client(server):
 async def test_basic(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
-        input="What is 13 * 24?",
+        input="What is 123 * 456?",
    )
    assert response is not None
    print("response: ", response)
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -197,12 +197,86 @@ class RemoteOpenAIServer:
        return self

    def __exit__(self, exc_type, exc_value, traceback):
+        pid = self.proc.pid
+        # Graceful shutdown
        self.proc.terminate()
        try:
-            self.proc.wait(8)
+            self.proc.wait(timeout=15)
+            print(f"[RemoteOpenAIServer] Server {pid} terminated gracefully")
        except subprocess.TimeoutExpired:
-            # force kill if needed
+            print(
+                f"[RemoteOpenAIServer] Server {pid} did not respond "
+                "to SIGTERM, sending SIGKILL"
+            )
            self.proc.kill()
+            try:
+                self.proc.wait(timeout=5)
+                print(f"[RemoteOpenAIServer] Server {pid} killed")
+            except subprocess.TimeoutExpired as err:
+                raise RuntimeError(
+                    f"[RemoteOpenAIServer] Failed to kill server process {pid}"
+                ) from err
+        # Wait for GPU memory to be released
+        self._wait_for_gpu_memory_release()
+
+    def _get_gpu_memory_used(self) -> float | None:
+        """Get total GPU memory used across all visible devices in bytes."""
+        try:
+            if current_platform.is_rocm():
+                with _nvml():
+                    handles = amdsmi_get_processor_handles()
+                    total_used = 0
+                    for handle in handles:
+                        vram_info = amdsmi_get_gpu_vram_usage(handle)
+                        total_used += vram_info["vram_used"]
+                    return total_used
+            elif current_platform.is_cuda():
+                with _nvml():
+                    total_used = 0
+                    device_count = cuda_device_count_stateless()
+                    for i in range(device_count):
+                        handle = nvmlDeviceGetHandleByIndex(i)
+                        mem_info = nvmlDeviceGetMemoryInfo(handle)
+                        total_used += mem_info.used
+                    return total_used
+        except Exception as e:
+            print(f"[RemoteOpenAIServer] Could not query GPU memory: {e}")
+            return None
+        return None
+
+    def _wait_for_gpu_memory_release(self, timeout: float = 30.0):
+        """Poll GPU memory until it stabilizes, indicating cleanup is complete."""
+        start = time.time()
+        prev_used: float | None = None
+        stable_count = 0
+
+        while time.time() - start < timeout:
+            used = self._get_gpu_memory_used()
+
+            if used is None:
+                return  # Can't query, assume ok
+
+            if prev_used is not None and abs(used - prev_used) < 100 * 1024 * 1024:
+                stable_count += 1
+                if stable_count >= 3:
+                    used_gb = used / 1e9
+                    print(
+                        f"[RemoteOpenAIServer] GPU memory stabilized "
+                        f"at {used_gb:.2f} GB"
+                    )
+                    return
+            else:
+                stable_count = 0
+
+            prev_used = used
+            time.sleep(0.1)
+
+        last_reading = prev_used / 1e9 if prev_used is not None else 0.0
+        raise RuntimeError(
+            f"[RemoteOpenAIServer] GPU memory did not stabilize within {timeout}s. "
+            f"Last reading: {last_reading:.2f} GB. "
+            "Child processes may still be holding GPU memory."
+        )

    def _poll(self) -> int | None:
        """Subclasses override this method to customize process polling"""
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -68,6 +68,14 @@ MCP_BUILTIN_TOOLS: set[str] = {
    "container",
 }

+# Mapping from built-in tool recipient names to their MCP server labels.
+# This ensures consistency between streaming and non-streaming responses.
+_BUILTIN_TOOL_TO_MCP_SERVER_LABEL: dict[str, str] = {
+    "python": "code_interpreter",
+    "browser": "web_search_preview",
+    "container": "container",
+}
+

 def has_custom_tools(tool_types: set[str]) -> bool:
    """
@@ -601,7 +609,13 @@ def _parse_mcp_recipient(recipient: str) -> tuple[str, str]:

 def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
    """Parse MCP calls into MCP call items."""
-    server_label, tool_name = _parse_mcp_recipient(recipient)
+    # Handle built-in tools that need server_label mapping
+    if recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
+        server_label = _BUILTIN_TOOL_TO_MCP_SERVER_LABEL[recipient]
+        tool_name = recipient
+    else:
+        server_label, tool_name = _parse_mcp_recipient(recipient)
+
    output_items = []
    for content in message.content:
        response_item = McpCall(
@@ -630,7 +644,7 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
    recipient = message.recipient

    if recipient is not None:
-        # Browser tool calls
+        # Browser tool calls (browser.search, browser.open, browser.find)
        if recipient.startswith("browser."):
            output_items.append(_parse_browser_tool_call(message, recipient))

@@ -638,10 +652,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
        elif message.channel == "commentary" and recipient.startswith("functions."):
            output_items.extend(_parse_function_call(message, recipient))

-        # Built-in tools are treated as reasoning
-        elif recipient.startswith(("python", "browser", "container")):
-            # Built-in tool recipients (python/browser/container)
-            # generate reasoning output
+        # Built-in MCP tools (python, browser, container)
+        elif recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
            output_items.extend(_parse_reasoning(message))

        # All other recipients are MCP calls
@@ -688,13 +700,23 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]:
                    status="in_progress",
                )
            ]
-        # Built-in tools (python, browser, container) should be treated as reasoning
-        elif not (
-            current_recipient.startswith("python")
-            or current_recipient.startswith("browser")
-            or current_recipient.startswith("container")
-        ):
-            # All other recipients are MCP calls
+        # Built-in MCP tools (python, browser, container)
+        elif current_recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
+            return [
+                ResponseReasoningItem(
+                    id=f"rs_{random_uuid()}",
+                    summary=[],
+                    type="reasoning",
+                    content=[
+                        ResponseReasoningTextContent(
+                            text=parser.current_content, type="reasoning_text"
+                        )
+                    ],
+                    status=None,
+                )
+            ]
+        # All other recipients are MCP calls
+        else:
            rid = random_uuid()
            server_label, tool_name = _parse_mcp_recipient(current_recipient)
            return [