# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
|
2025-12-05 08:11:50 -08:00
|
|
|
import json
|
2026-02-20 22:03:32 -06:00
|
|
|
import logging
|
2025-12-02 08:24:45 -08:00
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
import pytest_asyncio
|
|
|
|
|
from openai import OpenAI
|
|
|
|
|
|
2026-03-17 01:14:52 -04:00
|
|
|
from tests.utils import RemoteOpenAIServer
|
|
|
|
|
|
2026-02-20 22:03:32 -06:00
|
|
|
from .conftest import (
|
|
|
|
|
BASE_TEST_ENV,
|
|
|
|
|
has_output_type,
|
|
|
|
|
log_response_diagnostics,
|
|
|
|
|
retry_for_tool_call,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Module-level logger used by the test helpers below.
logger = logging.getLogger(__name__)

# Model served by the module-scoped `server` fixture.
MODEL_NAME = "Qwen/Qwen3-8B"

# Instruction used by the MCP tool-call test to push the model toward
# actually executing Python instead of fabricating a result.
_PYTHON_TOOL_INSTRUCTION = (
    "You must use the Python tool to execute code. "
    "Never simulate execution. You must print the final answer."
)
|
|
|
|
|
|
2025-12-02 08:24:45 -08:00
|
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
def server():
    """Launch one shared vLLM OpenAI-compatible server for this module."""
    assert importlib.util.find_spec("gpt_oss") is not None, (
        "Harmony tests require gpt_oss package to be installed"
    )

    # CLI options as (flag, value...) tuples, flattened into argv order.
    cli_options = [
        ("--reasoning-parser", "qwen3"),
        ("--max_model_len", "5000"),
        ("--structured-outputs-config.backend", "xgrammar"),
        ("--enable-auto-tool-choice",),
        ("--tool-call-parser", "hermes"),
        ("--tool-server", "demo"),
    ]
    args = [token for option in cli_options for token in option]

    # Base test environment plus the flags these tests depend on.
    env_dict = BASE_TEST_ENV | {
        "VLLM_ENABLE_RESPONSES_API_STORE": "1",
        "VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT": "1",
        "PYTHON_EXECUTION_BACKEND": "dangerously_use_uv",
    }
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as srv:
        yield srv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the module-scoped server."""
    async with server.get_async_client() as ac:
        yield ac
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """Smoke test: a simple arithmetic prompt completes successfully."""
    response = await client.responses.create(
        model=model_name,
        input="What is 123 * 456?",
        temperature=0.0,
    )
    assert response is not None
    # Use the module logger (lazy %-args) rather than print, consistent
    # with the rest of this file.
    logger.info("response: %s", response)
    assert response.status == "completed"
    # A completed response must not report truncation details.
    assert response.incomplete_details is None
|
2025-12-02 08:24:45 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
    """Replay a prior turn's items (user message, reasoning, function call
    and its output) as input and check the model still answers normally.
    """
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            # A previously-emitted reasoning item fed back verbatim.
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
            # A completed function call from the earlier turn ...
            {
                "arguments": '{"location": "Paris", "unit": "celsius"}',
                "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
                "name": "get_weather",
                "type": "function_call",
                "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
                "status": "completed",
            },
            # ... and its matching output (same call_id pairs them).
            {
                "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
                "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
                "output": "The weather in Paris is 20 Celsius",
                "status": "completed",
                "type": "function_call_output",
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"

    # The new turn should contain fresh reasoning plus a final message.
    output_types = [getattr(o, "type", None) for o in response.output]
    assert "reasoning" in output_types, (
        f"Expected reasoning in output, got: {output_types}"
    )
    assert "message" in output_types, f"Expected message in output, got: {output_types}"

    # The final message must carry plain-text content.
    msg = next(o for o in response.output if o.type == "message")
    assert type(msg.content[0].text) is str
|
2025-12-05 08:11:50 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_horoscope(sign):
    """Return a canned horoscope string for *sign* (deterministic test stub)."""
    template = "{}: Next Tuesday you will befriend a baby otter."
    return template.format(sign)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_function(name, args):
    """Dispatch a tool call by name; raise ValueError for unknown tools."""
    logger.info("Calling function %s with args %s", name, args)
    # Dispatch table keeps adding new tools to a single place.
    handlers = {"get_horoscope": get_horoscope}
    handler = handlers.get(name)
    if handler is None:
        raise ValueError(f"Unknown function: {name}")
    return handler(**args)
|
2025-12-05 08:11:50 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_call_first_turn(client: OpenAI, model_name: str):
    """The model should emit a function_call for a prompt that clearly
    matches the single advertised tool.
    """
    # One strict-schema tool: the model must supply a "sign" string.
    tools = [
        {
            "type": "function",
            "name": "get_horoscope",
            "description": "Get today's horoscope for an astrological sign.",
            "parameters": {
                "type": "object",
                "properties": {
                    "sign": {"type": "string"},
                },
                "required": ["sign"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]

    # retry_for_tool_call re-issues the request until the expected tool
    # type appears (tool emission can be nondeterministic).
    response = await retry_for_tool_call(
        client,
        model=model_name,
        expected_tool_type="function_call",
        input="What is the horoscope for Aquarius today?",
        tools=tools,
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"

    output_types = [getattr(o, "type", None) for o in response.output]
    assert "reasoning" in output_types, (
        f"Expected reasoning in output, got: {output_types}"
    )
    assert has_output_type(response, "function_call"), (
        f"Expected function_call in output, got: {output_types}"
    )

    function_call = next(o for o in response.output if o.type == "function_call")
    assert function_call.name == "get_horoscope"
    assert function_call.call_id is not None

    # Arguments must be valid JSON containing the required parameter.
    args = json.loads(function_call.arguments)
    assert "sign" in args
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_call(client: OpenAI, model_name: str):
    """MCP tool calling with code_interpreter.

    The model may make one or more tool calls before producing a final
    message. We validate server invariants (mcp_call items have correct
    fields) with hard assertions. Output indices are never hardcoded
    since the model can produce multiple tool-call rounds.
    """
    # MCP + container init + code execution can be slow
    client_with_timeout = client.with_options(timeout=client.timeout * 3)

    response = await retry_for_tool_call(
        client_with_timeout,
        model=model_name,
        expected_tool_type="mcp_call",
        input=(
            "What is 123 * 456? Use python to calculate the result. "
            "Print the result with print()."
        ),
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        instructions=_PYTHON_TOOL_INSTRUCTION,
        temperature=0.0,
        extra_body={"enable_response_messages": True},
    )

    assert response is not None

    output_types = [getattr(o, "type", None) for o in response.output]
    log_response_diagnostics(response, label="test_mcp_tool_call")

    assert response.status == "completed", (
        f"Response status={response.status} "
        f"(details={getattr(response, 'incomplete_details', None)}). "
        f"Output types: {output_types}."
    )

    assert "reasoning" in output_types, (
        f"Expected reasoning in output, got: {output_types}"
    )
    assert "mcp_call" in output_types, (
        f"Expected mcp_call in output, got: {output_types}"
    )

    # Every mcp_call item must have well-typed fields.
    # isinstance (not `type(...) is`) is the idiomatic check and also
    # accepts str subclasses the client library may return.
    for item in response.output:
        if getattr(item, "type", None) == "mcp_call":
            assert isinstance(item.arguments, str), (
                f"mcp_call.arguments should be str, got {type(item.arguments)}"
            )
            assert isinstance(item.output, str), (
                f"mcp_call.output should be str, got {type(item.output)}"
            )

    # The model may make 1+ tool-call rounds but must still produce
    # a final message for a trivial calculation like 123 * 456.
    message_outputs = [
        o for o in response.output if getattr(o, "type", None) == "message"
    ]
    assert message_outputs, (
        f"Model did not produce a final message. Output types: {output_types}"
    )

    final_message = message_outputs[-1]
    assert any(s in final_message.content[0].text for s in ("56088", "56,088")), (
        f"Expected 56088 in final message, got: {final_message.content[0].text!r}"
    )

    # Validate raw input_messages / output_messages
    assert len(response.input_messages) >= 1, "Expected at least 1 input message"
    assert len(response.output_messages) >= 1, "Expected at least 1 output message"
    assert any(
        any(s in str(msg) for s in ("56088", "56,088"))
        for msg in response.output_messages
    ), (
        "Expected 56088 in at least one output_message, "
        f"got {len(response.output_messages)} messages"
    )
|
2026-01-09 16:00:57 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """A tight max_output_tokens budget must yield an incomplete response."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
        temperature=0.0,
    )
    assert resp is not None
    # Truncation surfaces as status="incomplete" with the budget as reason.
    assert resp.status == "incomplete"
    assert resp.incomplete_details.reason == "max_output_tokens"
|