[CI][MCP][Harmony] Heavy refactoring Harmony & MCP response tests and stabilizing with deterministic test infrastructure (#33949)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-02-20 22:03:32 -06:00
committed by GitHub
parent 5719a4e4e6
commit 991d6bff38
10 changed files with 1187 additions and 886 deletions

View File

@@ -3,15 +3,29 @@
import importlib.util
import json
import logging
import pytest
import pytest_asyncio
from openai import OpenAI
from ....utils import RemoteOpenAIServer
from .conftest import (
BASE_TEST_ENV,
has_output_type,
log_response_diagnostics,
retry_for_tool_call,
)
logger = logging.getLogger(__name__)
MODEL_NAME = "Qwen/Qwen3-8B"
_PYTHON_TOOL_INSTRUCTION = (
"You must use the Python tool to execute code. "
"Never simulate execution. You must print the final answer."
)
@pytest.fixture(scope="module")
def server():
@@ -32,12 +46,12 @@ def server():
"--tool-server",
"demo",
]
env_dict = dict(
VLLM_ENABLE_RESPONSES_API_STORE="1",
VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
)
env_dict = {
**BASE_TEST_ENV,
"VLLM_ENABLE_RESPONSES_API_STORE": "1",
"VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT": "1",
"PYTHON_EXECUTION_BACKEND": "dangerously_use_uv",
}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
yield remote_server
@@ -54,6 +68,7 @@ async def test_basic(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is 123 * 456?",
temperature=0.0,
)
assert response is not None
print("response: ", response)
@@ -99,10 +114,15 @@ async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
)
assert response is not None
assert response.status == "completed"
# make sure we get a reasoning and text output
assert response.output[0].type == "reasoning"
assert response.output[1].type == "message"
assert type(response.output[1].content[0].text) is str
output_types = [getattr(o, "type", None) for o in response.output]
assert "reasoning" in output_types, (
f"Expected reasoning in output, got: {output_types}"
)
assert "message" in output_types, f"Expected message in output, got: {output_types}"
msg = next(o for o in response.output if o.type == "message")
assert type(msg.content[0].text) is str
def get_horoscope(sign):
@@ -110,10 +130,10 @@ def get_horoscope(sign):
def call_function(name, args):
logger.info("Calling function %s with args %s", name, args)
if name == "get_horoscope":
return get_horoscope(**args)
else:
raise ValueError(f"Unknown function: {name}")
raise ValueError(f"Unknown function: {name}")
@pytest.mark.asyncio
@@ -136,61 +156,111 @@ async def test_function_call_first_turn(client: OpenAI, model_name: str):
}
]
response = await client.responses.create(
response = await retry_for_tool_call(
client,
model=model_name,
expected_tool_type="function_call",
input="What is the horoscope for Aquarius today?",
tools=tools,
temperature=0.0,
)
assert response is not None
assert response.status == "completed"
assert len(response.output) == 2
assert response.output[0].type == "reasoning"
assert response.output[1].type == "function_call"
function_call = response.output[1]
output_types = [getattr(o, "type", None) for o in response.output]
assert "reasoning" in output_types, (
f"Expected reasoning in output, got: {output_types}"
)
assert has_output_type(response, "function_call"), (
f"Expected function_call in output, got: {output_types}"
)
function_call = next(o for o in response.output if o.type == "function_call")
assert function_call.name == "get_horoscope"
assert function_call.call_id is not None
args = json.loads(function_call.arguments)
assert "sign" in args
# the multi turn function call is tested above in
# test_reasoning_and_function_items
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_call(client: OpenAI, model_name: str):
response = await client.responses.create(
"""MCP tool calling with code_interpreter.
The model may make one or more tool calls before producing a final
message. We validate server invariants (mcp_call items have correct
fields) with hard assertions. Output indices are never hardcoded
since the model can produce multiple tool-call rounds.
"""
# MCP + container init + code execution can be slow
client_with_timeout = client.with_options(timeout=client.timeout * 3)
response = await retry_for_tool_call(
client_with_timeout,
model=model_name,
input="What is 123 * 456? Use python to calculate the result.",
expected_tool_type="mcp_call",
input=(
"What is 123 * 456? Use python to calculate the result. "
"Print the result with print()."
),
tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
extra_body={"enable_response_messages": True},
instructions=_PYTHON_TOOL_INSTRUCTION,
temperature=0.0,
extra_body={"enable_response_messages": True},
)
assert response is not None
assert response.status == "completed"
# The model may produce multiple reasoning/mcp_call rounds before the
# final message, so validate structurally rather than by exact index.
output_types = [o.type for o in response.output]
assert "reasoning" in output_types
mcp_calls = [o for o in response.output if o.type == "mcp_call"]
assert len(mcp_calls) >= 1
assert type(mcp_calls[0].arguments) is str
assert type(mcp_calls[0].output) is str
output_types = [getattr(o, "type", None) for o in response.output]
log_response_diagnostics(response, label="test_mcp_tool_call")
# The final output should be a message containing the correct answer
assert response.output[-1].type == "message"
assert any(s in response.output[-1].content[0].text for s in ("56088", "56,088"))
assert response.status == "completed", (
f"Response status={response.status} "
f"(details={getattr(response, 'incomplete_details', None)}). "
f"Output types: {output_types}."
)
# Test raw input_messages / output_messages
assert len(response.input_messages) == 1
assert len(response.output_messages) >= 3
assert "reasoning" in output_types, (
f"Expected reasoning in output, got: {output_types}"
)
assert "mcp_call" in output_types, (
f"Expected mcp_call in output, got: {output_types}"
)
# Every mcp_call item must have well-typed fields
for item in response.output:
if getattr(item, "type", None) == "mcp_call":
assert type(item.arguments) is str, (
f"mcp_call.arguments should be str, got {type(item.arguments)}"
)
assert type(item.output) is str, (
f"mcp_call.output should be str, got {type(item.output)}"
)
# The model may make 1+ tool-call rounds but must still produce
# a final message for a trivial calculation like 123 * 456.
message_outputs = [
o for o in response.output if getattr(o, "type", None) == "message"
]
assert message_outputs, (
f"Model did not produce a final message. Output types: {output_types}"
)
final_message = message_outputs[-1]
assert any(s in final_message.content[0].text for s in ("56088", "56,088")), (
f"Expected 56088 in final message, got: {final_message.content[0].text!r}"
)
# Validate raw input_messages / output_messages
assert len(response.input_messages) >= 1, "Expected at least 1 input message"
assert len(response.output_messages) >= 1, "Expected at least 1 output message"
assert any(
s in response.output_messages[-1]["message"] for s in ("56088", "56,088")
any(s in str(msg) for s in ("56088", "56,088"))
for msg in response.output_messages
), (
f"Expected 56088 in at least one output_message, "
f"got {len(response.output_messages)} messages"
)
@@ -202,6 +272,7 @@ async def test_max_tokens(client: OpenAI, model_name: str):
input="What is the first paragraph of Moby Dick?",
reasoning={"effort": "low"},
max_output_tokens=30,
temperature=0.0,
)
assert response is not None
assert response.status == "incomplete"