[CI][Bugfix]: return McpCall for built-in MCP tools in non-streaming mode (#32762)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
@@ -62,7 +62,7 @@ async def client(server):
|
|||||||
async def test_basic(client: OpenAI, model_name: str):
|
async def test_basic(client: OpenAI, model_name: str):
|
||||||
response = await client.responses.create(
|
response = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is 13 * 24?",
|
input="What is 123 * 456?",
|
||||||
)
|
)
|
||||||
assert response is not None
|
assert response is not None
|
||||||
print("response: ", response)
|
print("response: ", response)
|
||||||
@@ -74,7 +74,7 @@ async def test_basic(client: OpenAI, model_name: str):
|
|||||||
async def test_basic_with_instructions(client: OpenAI, model_name: str):
|
async def test_basic_with_instructions(client: OpenAI, model_name: str):
|
||||||
response = await client.responses.create(
|
response = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is 13 * 24?",
|
input="What is 123 * 456?",
|
||||||
instructions="Respond in Korean.",
|
instructions="Respond in Korean.",
|
||||||
)
|
)
|
||||||
assert response is not None
|
assert response is not None
|
||||||
@@ -116,7 +116,7 @@ async def test_chat(client: OpenAI, model_name: str):
|
|||||||
{"role": "system", "content": "Respond in Korean."},
|
{"role": "system", "content": "Respond in Korean."},
|
||||||
{"role": "user", "content": "Hello!"},
|
{"role": "user", "content": "Hello!"},
|
||||||
{"role": "assistant", "content": "Hello! How can I help you today?"},
|
{"role": "assistant", "content": "Hello! How can I help you today?"},
|
||||||
{"role": "user", "content": "What is 13 * 24? Explain your answer."},
|
{"role": "user", "content": "What is 123 * 456? Explain your answer."},
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
assert response is not None
|
assert response is not None
|
||||||
@@ -131,7 +131,7 @@ async def test_chat_with_input_type(client: OpenAI, model_name: str):
|
|||||||
input=[
|
input=[
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [{"type": "input_text", "text": "What is 13*24?"}],
|
"content": [{"type": "input_text", "text": "What is 123 * 456?"}],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@@ -200,7 +200,7 @@ async def test_store(client: OpenAI, model_name: str):
|
|||||||
for store in [True, False]:
|
for store in [True, False]:
|
||||||
response = await client.responses.create(
|
response = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is 13 * 24?",
|
input="What is 123 * 456?",
|
||||||
store=store,
|
store=store,
|
||||||
)
|
)
|
||||||
assert response is not None
|
assert response is not None
|
||||||
@@ -219,7 +219,7 @@ async def test_store(client: OpenAI, model_name: str):
|
|||||||
async def test_background(client: OpenAI, model_name: str):
|
async def test_background(client: OpenAI, model_name: str):
|
||||||
response = await client.responses.create(
|
response = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is 13 * 24?",
|
input="What is 123 * 456?",
|
||||||
background=True,
|
background=True,
|
||||||
)
|
)
|
||||||
assert response is not None
|
assert response is not None
|
||||||
@@ -256,7 +256,7 @@ async def test_background_cancel(client: OpenAI, model_name: str):
|
|||||||
async def test_stateful_multi_turn(client: OpenAI, model_name: str):
|
async def test_stateful_multi_turn(client: OpenAI, model_name: str):
|
||||||
response1 = await client.responses.create(
|
response1 = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is 13 * 24?",
|
input="What is 123 * 456?",
|
||||||
)
|
)
|
||||||
assert response1 is not None
|
assert response1 is not None
|
||||||
assert response1.status == "completed"
|
assert response1.status == "completed"
|
||||||
@@ -361,7 +361,7 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
|
|||||||
# TODO: Add back when web search and code interpreter are available in CI
|
# TODO: Add back when web search and code interpreter are available in CI
|
||||||
prompts = [
|
prompts = [
|
||||||
"tell me a story about a cat in 20 words",
|
"tell me a story about a cat in 20 words",
|
||||||
"What is 13 * 24? Use python to calculate the result.",
|
"What is 123 * 456? Use python to calculate the result.",
|
||||||
# "When did Jensen found NVIDIA? Search it and answer the year only.",
|
# "When did Jensen found NVIDIA? Search it and answer the year only.",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -976,6 +976,9 @@ async def test_mcp_code_interpreter_streaming(client: OpenAI, model_name: str, s
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||||
|
@pytest.mark.dependency(
|
||||||
|
depends=["test_mcp_code_interpreter_streaming[openai/gpt-oss-20b]"]
|
||||||
|
)
|
||||||
async def test_mcp_tool_multi_turn(client: OpenAI, model_name: str, server):
|
async def test_mcp_tool_multi_turn(client: OpenAI, model_name: str, server):
|
||||||
"""Test MCP tool calling across multiple turns.
|
"""Test MCP tool calling across multiple turns.
|
||||||
|
|
||||||
@@ -1117,8 +1120,10 @@ async def test_function_call_with_previous_input_messages(
|
|||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is the horoscope for Aquarius today?",
|
input="What is the horoscope for Aquarius today?",
|
||||||
tools=tools,
|
tools=tools,
|
||||||
|
temperature=0.0,
|
||||||
extra_body={"enable_response_messages": True},
|
extra_body={"enable_response_messages": True},
|
||||||
stream=True,
|
stream=True,
|
||||||
|
max_output_tokens=1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
response = None
|
response = None
|
||||||
@@ -1170,6 +1175,7 @@ async def test_function_call_with_previous_input_messages(
|
|||||||
stream_response_2 = await client.responses.create(
|
stream_response_2 = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
|
temperature=0.0,
|
||||||
input="",
|
input="",
|
||||||
extra_body={
|
extra_body={
|
||||||
"previous_input_messages": previous_messages,
|
"previous_input_messages": previous_messages,
|
||||||
|
|||||||
@@ -160,6 +160,7 @@ class TestMCPEnabled:
|
|||||||
"No developer messages should be present with valid mcp tool"
|
"No developer messages should be present with valid mcp tool"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@pytest.mark.flaky(reruns=3)
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||||
async def test_mcp_tool_with_allowed_tools_star(
|
async def test_mcp_tool_with_allowed_tools_star(
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ async def client(server):
|
|||||||
async def test_basic(client: OpenAI, model_name: str):
|
async def test_basic(client: OpenAI, model_name: str):
|
||||||
response = await client.responses.create(
|
response = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is 13 * 24?",
|
input="What is 123 * 456?",
|
||||||
)
|
)
|
||||||
assert response is not None
|
assert response is not None
|
||||||
print("response: ", response)
|
print("response: ", response)
|
||||||
@@ -164,7 +164,7 @@ async def test_function_call_first_turn(client: OpenAI, model_name: str):
|
|||||||
async def test_mcp_tool_call(client: OpenAI, model_name: str):
|
async def test_mcp_tool_call(client: OpenAI, model_name: str):
|
||||||
response = await client.responses.create(
|
response = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is 13 * 24? Use python to calculate the result.",
|
input="What is 123 * 456? Use python to calculate the result.",
|
||||||
tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
|
tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
|
||||||
extra_body={"enable_response_messages": True},
|
extra_body={"enable_response_messages": True},
|
||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
@@ -179,12 +179,12 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
|
|||||||
assert response.output[2].type == "reasoning"
|
assert response.output[2].type == "reasoning"
|
||||||
# make sure the correct math is in the final output
|
# make sure the correct math is in the final output
|
||||||
assert response.output[3].type == "message"
|
assert response.output[3].type == "message"
|
||||||
assert "312" in response.output[3].content[0].text
|
assert "56088" in response.output[3].content[0].text
|
||||||
|
|
||||||
# test raw input_messages / output_messages
|
# test raw input_messages / output_messages
|
||||||
assert len(response.input_messages) == 1
|
assert len(response.input_messages) == 1
|
||||||
assert len(response.output_messages) == 3
|
assert len(response.output_messages) == 3
|
||||||
assert "312" in response.output_messages[2]["message"]
|
assert "56088" in response.output_messages[2]["message"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ async def client(server):
|
|||||||
async def test_basic(client: OpenAI, model_name: str):
|
async def test_basic(client: OpenAI, model_name: str):
|
||||||
response = await client.responses.create(
|
response = await client.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
input="What is 13 * 24?",
|
input="What is 123 * 456?",
|
||||||
)
|
)
|
||||||
assert response is not None
|
assert response is not None
|
||||||
print("response: ", response)
|
print("response: ", response)
|
||||||
|
|||||||
@@ -197,12 +197,86 @@ class RemoteOpenAIServer:
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_value, traceback):
|
def __exit__(self, exc_type, exc_value, traceback):
|
||||||
|
pid = self.proc.pid
|
||||||
|
# Graceful shutdown
|
||||||
self.proc.terminate()
|
self.proc.terminate()
|
||||||
try:
|
try:
|
||||||
self.proc.wait(8)
|
self.proc.wait(timeout=15)
|
||||||
|
print(f"[RemoteOpenAIServer] Server {pid} terminated gracefully")
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
# force kill if needed
|
print(
|
||||||
|
f"[RemoteOpenAIServer] Server {pid} did not respond "
|
||||||
|
"to SIGTERM, sending SIGKILL"
|
||||||
|
)
|
||||||
self.proc.kill()
|
self.proc.kill()
|
||||||
|
try:
|
||||||
|
self.proc.wait(timeout=5)
|
||||||
|
print(f"[RemoteOpenAIServer] Server {pid} killed")
|
||||||
|
except subprocess.TimeoutExpired as err:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"[RemoteOpenAIServer] Failed to kill server process {pid}"
|
||||||
|
) from err
|
||||||
|
# Wait for GPU memory to be released
|
||||||
|
self._wait_for_gpu_memory_release()
|
||||||
|
|
||||||
|
def _get_gpu_memory_used(self) -> float | None:
|
||||||
|
"""Get total GPU memory used across all visible devices in bytes."""
|
||||||
|
try:
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
with _nvml():
|
||||||
|
handles = amdsmi_get_processor_handles()
|
||||||
|
total_used = 0
|
||||||
|
for handle in handles:
|
||||||
|
vram_info = amdsmi_get_gpu_vram_usage(handle)
|
||||||
|
total_used += vram_info["vram_used"]
|
||||||
|
return total_used
|
||||||
|
elif current_platform.is_cuda():
|
||||||
|
with _nvml():
|
||||||
|
total_used = 0
|
||||||
|
device_count = cuda_device_count_stateless()
|
||||||
|
for i in range(device_count):
|
||||||
|
handle = nvmlDeviceGetHandleByIndex(i)
|
||||||
|
mem_info = nvmlDeviceGetMemoryInfo(handle)
|
||||||
|
total_used += mem_info.used
|
||||||
|
return total_used
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[RemoteOpenAIServer] Could not query GPU memory: {e}")
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _wait_for_gpu_memory_release(self, timeout: float = 30.0):
|
||||||
|
"""Poll GPU memory until it stabilizes, indicating cleanup is complete."""
|
||||||
|
start = time.time()
|
||||||
|
prev_used: float | None = None
|
||||||
|
stable_count = 0
|
||||||
|
|
||||||
|
while time.time() - start < timeout:
|
||||||
|
used = self._get_gpu_memory_used()
|
||||||
|
|
||||||
|
if used is None:
|
||||||
|
return # Can't query, assume ok
|
||||||
|
|
||||||
|
if prev_used is not None and abs(used - prev_used) < 100 * 1024 * 1024:
|
||||||
|
stable_count += 1
|
||||||
|
if stable_count >= 3:
|
||||||
|
used_gb = used / 1e9
|
||||||
|
print(
|
||||||
|
f"[RemoteOpenAIServer] GPU memory stabilized "
|
||||||
|
f"at {used_gb:.2f} GB"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
stable_count = 0
|
||||||
|
|
||||||
|
prev_used = used
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
last_reading = prev_used / 1e9 if prev_used is not None else 0.0
|
||||||
|
raise RuntimeError(
|
||||||
|
f"[RemoteOpenAIServer] GPU memory did not stabilize within {timeout}s. "
|
||||||
|
f"Last reading: {last_reading:.2f} GB. "
|
||||||
|
"Child processes may still be holding GPU memory."
|
||||||
|
)
|
||||||
|
|
||||||
def _poll(self) -> int | None:
|
def _poll(self) -> int | None:
|
||||||
"""Subclasses override this method to customize process polling"""
|
"""Subclasses override this method to customize process polling"""
|
||||||
|
|||||||
@@ -68,6 +68,14 @@ MCP_BUILTIN_TOOLS: set[str] = {
|
|||||||
"container",
|
"container",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Mapping from built-in tool recipient names to their MCP server labels.
|
||||||
|
# This ensures consistency between streaming and non-streaming responses.
|
||||||
|
_BUILTIN_TOOL_TO_MCP_SERVER_LABEL: dict[str, str] = {
|
||||||
|
"python": "code_interpreter",
|
||||||
|
"browser": "web_search_preview",
|
||||||
|
"container": "container",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def has_custom_tools(tool_types: set[str]) -> bool:
|
def has_custom_tools(tool_types: set[str]) -> bool:
|
||||||
"""
|
"""
|
||||||
@@ -601,7 +609,13 @@ def _parse_mcp_recipient(recipient: str) -> tuple[str, str]:
|
|||||||
|
|
||||||
def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
|
def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
|
||||||
"""Parse MCP calls into MCP call items."""
|
"""Parse MCP calls into MCP call items."""
|
||||||
server_label, tool_name = _parse_mcp_recipient(recipient)
|
# Handle built-in tools that need server_label mapping
|
||||||
|
if recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
|
||||||
|
server_label = _BUILTIN_TOOL_TO_MCP_SERVER_LABEL[recipient]
|
||||||
|
tool_name = recipient
|
||||||
|
else:
|
||||||
|
server_label, tool_name = _parse_mcp_recipient(recipient)
|
||||||
|
|
||||||
output_items = []
|
output_items = []
|
||||||
for content in message.content:
|
for content in message.content:
|
||||||
response_item = McpCall(
|
response_item = McpCall(
|
||||||
@@ -630,7 +644,7 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
|
|||||||
recipient = message.recipient
|
recipient = message.recipient
|
||||||
|
|
||||||
if recipient is not None:
|
if recipient is not None:
|
||||||
# Browser tool calls
|
# Browser tool calls (browser.search, browser.open, browser.find)
|
||||||
if recipient.startswith("browser."):
|
if recipient.startswith("browser."):
|
||||||
output_items.append(_parse_browser_tool_call(message, recipient))
|
output_items.append(_parse_browser_tool_call(message, recipient))
|
||||||
|
|
||||||
@@ -638,10 +652,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
|
|||||||
elif message.channel == "commentary" and recipient.startswith("functions."):
|
elif message.channel == "commentary" and recipient.startswith("functions."):
|
||||||
output_items.extend(_parse_function_call(message, recipient))
|
output_items.extend(_parse_function_call(message, recipient))
|
||||||
|
|
||||||
# Built-in tools are treated as reasoning
|
# Built-in MCP tools (python, browser, container)
|
||||||
elif recipient.startswith(("python", "browser", "container")):
|
elif recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
|
||||||
# Built-in tool recipients (python/browser/container)
|
|
||||||
# generate reasoning output
|
|
||||||
output_items.extend(_parse_reasoning(message))
|
output_items.extend(_parse_reasoning(message))
|
||||||
|
|
||||||
# All other recipients are MCP calls
|
# All other recipients are MCP calls
|
||||||
@@ -688,13 +700,23 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]:
|
|||||||
status="in_progress",
|
status="in_progress",
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
# Built-in tools (python, browser, container) should be treated as reasoning
|
# Built-in MCP tools (python, browser, container)
|
||||||
elif not (
|
elif current_recipient in _BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
|
||||||
current_recipient.startswith("python")
|
return [
|
||||||
or current_recipient.startswith("browser")
|
ResponseReasoningItem(
|
||||||
or current_recipient.startswith("container")
|
id=f"rs_{random_uuid()}",
|
||||||
):
|
summary=[],
|
||||||
# All other recipients are MCP calls
|
type="reasoning",
|
||||||
|
content=[
|
||||||
|
ResponseReasoningTextContent(
|
||||||
|
text=parser.current_content, type="reasoning_text"
|
||||||
|
)
|
||||||
|
],
|
||||||
|
status=None,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
# All other recipients are MCP calls
|
||||||
|
else:
|
||||||
rid = random_uuid()
|
rid = random_uuid()
|
||||||
server_label, tool_name = _parse_mcp_recipient(current_recipient)
|
server_label, tool_name = _parse_mcp_recipient(current_recipient)
|
||||||
return [
|
return [
|
||||||
|
|||||||
Reference in New Issue
Block a user