[Bugfix] Fix shared-object aliasing in n>1 streaming with tool calls (#38158)

Signed-off-by: Yifan Zong <yzong@redhat.com>
Signed-off-by: Yifan <yzong@redhat.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
This commit is contained in:
yzong-rh
2026-03-30 06:12:13 -04:00
committed by GitHub
parent cc06b4e86b
commit 3683fe6c06
2 changed files with 168 additions and 2 deletions

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import json
from contextlib import suppress
from dataclasses import dataclass, field
from typing import Any
@@ -1792,6 +1793,170 @@ async def test_tool_choice_validation_without_parser():
assert "--tool-call-parser" in response_named.error.message
@pytest.mark.asyncio
async def test_streaming_n_gt1_independent_tool_parsers():
    """n>1 streaming must use independent parser instances
    and token-id histories per choice.

    Every choice in the mocked stream receives the identical token/text
    deltas, so each choice must independently reassemble the same complete
    tool call. If parser objects or token-id history lists were aliased
    across choices, at least one choice's reconstructed call would be
    corrupted or missing.
    """
    # -- Serving stack wired to a fully mocked engine ----------------------
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    mock_engine.renderer = _build_renderer(mock_engine.model_config)

    models = OpenAIServingModels(
        engine_client=mock_engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    openai_serving_render = _build_serving_render(mock_engine, models.registry)
    serving_chat = OpenAIServingChat(
        mock_engine,
        models,
        response_role="assistant",
        openai_serving_render=openai_serving_render,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
        request_logger=None,
        enable_auto_tools=True,
        tool_parser="hermes",
    )
    tokenizer = get_tokenizer(MODEL_NAME)

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get weather",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]
    num_choices = 2
    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "test"}],
        n=num_choices,
        stream=True,
        tools=tools,
        tool_choice="auto",
    )

    # Hermes-style tool-call text that each choice will emit token by token.
    tool_call_text = (
        "<tool_call>\n"
        '{"name": "get_weather", "arguments": {"city": "Tokyo"}}\n'
        "</tool_call>"
    )
    all_token_ids = tokenizer.encode(tool_call_text, add_special_tokens=False)

    # Compute proper delta text for each token so that concatenated deltas
    # reproduce the original string exactly.
    steps: list[tuple[str, int]] = []
    prev_decoded = ""
    for i, tid in enumerate(all_token_ids):
        decoded_so_far = tokenizer.decode(all_token_ids[: i + 1])
        delta = decoded_so_far[len(prev_decoded) :]
        steps.append((delta, tid))
        prev_decoded = decoded_so_far

    async def result_generator():
        # Feed the SAME delta to every choice on each step, mirroring how
        # the engine reports incremental outputs for an n>1 request.
        for delta_text, token_id in steps:
            yield RequestOutput(
                request_id="test-req",
                prompt="test",
                prompt_token_ids=[1, 2, 3],
                prompt_logprobs=None,
                outputs=[
                    CompletionOutput(
                        index=choice_idx,
                        text=delta_text,
                        token_ids=[token_id],
                        cumulative_logprob=0.0,
                        logprobs=None,
                    )
                    for choice_idx in range(num_choices)
                ],
                finished=False,
            )
        # Final output with finish_reason
        yield RequestOutput(
            request_id="test-req",
            prompt="test",
            prompt_token_ids=[1, 2, 3],
            prompt_logprobs=None,
            outputs=[
                CompletionOutput(
                    index=choice_idx,
                    text="",
                    token_ids=[],
                    cumulative_logprob=0.0,
                    logprobs=None,
                    finish_reason="stop",
                )
                for choice_idx in range(num_choices)
            ],
            finished=True,
        )

    # Collect tool-call deltas per choice from the SSE stream.
    tc_deltas_by_choice: dict[int, list[dict]] = {i: [] for i in range(num_choices)}
    async for chunk_str in serving_chat.chat_completion_stream_generator(
        request=request,
        result_generator=result_generator(),
        request_id="test-req",
        model_name=MODEL_NAME,
        conversation=[],
        tokenizer=tokenizer,
        request_metadata=RequestResponseMetadata(
            request_id="test-req",
            model_name=MODEL_NAME,
        ),
    ):
        if not chunk_str.strip() or "data: [DONE]" in chunk_str:
            continue
        if chunk_str.startswith("data: "):
            # Strip the SSE "data: " framing and parse the JSON chunk.
            data = json.loads(chunk_str[6:].strip())
            for choice in data.get("choices", []):
                idx = choice["index"]
                delta = choice.get("delta", {})
                if delta.get("tool_calls"):
                    for tc in delta["tool_calls"]:
                        tc_deltas_by_choice[idx].append(tc)

    # Both choices must independently produce the correct tool call.
    for choice_idx in range(num_choices):
        deltas = tc_deltas_by_choice[choice_idx]
        assert len(deltas) > 0, (
            f"Choice {choice_idx}: expected tool-call deltas but got none"
        )
        # Reassemble the streamed function name and argument fragments.
        name = None
        args_buf = ""
        for tc in deltas:
            fn = tc.get("function", {})
            if fn.get("name"):
                name = fn["name"]
            if fn.get("arguments"):
                args_buf += fn["arguments"]
        assert name == "get_weather", (
            f"Choice {choice_idx}: expected 'get_weather', got {name!r}"
        )
        parsed_args = json.loads(args_buf)
        assert parsed_args == {"city": "Tokyo"}, (
            f"Choice {choice_idx}: expected {{'city': 'Tokyo'}}, got {parsed_args}"
        )
class TestCreateRemainingArgsDelta:
"""Tests for _create_remaining_args_delta helper function.

View File

@@ -548,7 +548,7 @@ class OpenAIServingChat(OpenAIServing):
# all_previous_token_ids will not be used twice in the same iteration.
if tool_choice_auto or reasoning_parser:
# These are only required in "auto" tool choice case
-all_previous_token_ids = [[]] * num_choices
+all_previous_token_ids = [[] for _ in range(num_choices)]
# For reasoning parser and tool call all enabled
added_content_delta_arr = [False] * num_choices
reasoning_end_arr = [False] * num_choices
@@ -566,7 +566,8 @@ class OpenAIServingChat(OpenAIServing):
tool_parsers: list[ToolParser | None] = [
self.tool_parser(tokenizer, request.tools)
-] * num_choices
+    for _ in range(num_choices)
+]
else:
tool_parsers = [None] * num_choices
except Exception as e: