[Bugfix] Fix shared-object aliasing in n>1 streaming with tool calls (#38158)

Signed-off-by: Yifan Zong <yzong@redhat.com>
Signed-off-by: Yifan <yzong@redhat.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
This commit is contained in:
yzong-rh
2026-03-30 06:12:13 -04:00
committed by GitHub
parent cc06b4e86b
commit 3683fe6c06
2 changed files with 168 additions and 2 deletions

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import json
from contextlib import suppress
from dataclasses import dataclass, field
from typing import Any
@@ -1792,6 +1793,170 @@ async def test_tool_choice_validation_without_parser():
assert "--tool-call-parser" in response_named.error.message
@pytest.mark.asyncio
async def test_streaming_n_gt1_independent_tool_parsers():
    """n>1 streaming must use independent parser instances
    and token-id histories per choice.

    Every choice in the mocked stream receives the identical token/text
    deltas, so each choice must independently reassemble the same complete
    tool call. If parser objects or token-id history lists were aliased
    across choices, at least one choice's reconstructed call would be
    corrupted or missing.
    """
    # -- Serving stack wired to a fully mocked engine ----------------------
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    mock_engine.renderer = _build_renderer(mock_engine.model_config)

    models = OpenAIServingModels(
        engine_client=mock_engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    openai_serving_render = _build_serving_render(mock_engine, models.registry)
    serving_chat = OpenAIServingChat(
        mock_engine,
        models,
        response_role="assistant",
        openai_serving_render=openai_serving_render,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
        request_logger=None,
        enable_auto_tools=True,
        tool_parser="hermes",
    )
    tokenizer = get_tokenizer(MODEL_NAME)

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get weather",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]
    num_choices = 2
    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "test"}],
        n=num_choices,
        stream=True,
        tools=tools,
        tool_choice="auto",
    )

    # Hermes-style tool-call text that each choice will emit token by token.
    tool_call_text = (
        "<tool_call>\n"
        '{"name": "get_weather", "arguments": {"city": "Tokyo"}}\n'
        "</tool_call>"
    )
    all_token_ids = tokenizer.encode(tool_call_text, add_special_tokens=False)

    # Compute proper delta text for each token so that concatenated deltas
    # reproduce the original string exactly.
    steps: list[tuple[str, int]] = []
    prev_decoded = ""
    for i, tid in enumerate(all_token_ids):
        decoded_so_far = tokenizer.decode(all_token_ids[: i + 1])
        delta = decoded_so_far[len(prev_decoded) :]
        steps.append((delta, tid))
        prev_decoded = decoded_so_far

    async def result_generator():
        # Feed the SAME delta to every choice on each step, mirroring how
        # the engine reports incremental outputs for an n>1 request.
        for delta_text, token_id in steps:
            yield RequestOutput(
                request_id="test-req",
                prompt="test",
                prompt_token_ids=[1, 2, 3],
                prompt_logprobs=None,
                outputs=[
                    CompletionOutput(
                        index=choice_idx,
                        text=delta_text,
                        token_ids=[token_id],
                        cumulative_logprob=0.0,
                        logprobs=None,
                    )
                    for choice_idx in range(num_choices)
                ],
                finished=False,
            )
        # Final output with finish_reason
        yield RequestOutput(
            request_id="test-req",
            prompt="test",
            prompt_token_ids=[1, 2, 3],
            prompt_logprobs=None,
            outputs=[
                CompletionOutput(
                    index=choice_idx,
                    text="",
                    token_ids=[],
                    cumulative_logprob=0.0,
                    logprobs=None,
                    finish_reason="stop",
                )
                for choice_idx in range(num_choices)
            ],
            finished=True,
        )

    # Collect tool-call deltas per choice from the SSE stream.
    tc_deltas_by_choice: dict[int, list[dict]] = {i: [] for i in range(num_choices)}
    async for chunk_str in serving_chat.chat_completion_stream_generator(
        request=request,
        result_generator=result_generator(),
        request_id="test-req",
        model_name=MODEL_NAME,
        conversation=[],
        tokenizer=tokenizer,
        request_metadata=RequestResponseMetadata(
            request_id="test-req",
            model_name=MODEL_NAME,
        ),
    ):
        if not chunk_str.strip() or "data: [DONE]" in chunk_str:
            continue
        if chunk_str.startswith("data: "):
            # Strip the SSE "data: " framing and parse the JSON chunk.
            data = json.loads(chunk_str[6:].strip())
            for choice in data.get("choices", []):
                idx = choice["index"]
                delta = choice.get("delta", {})
                if delta.get("tool_calls"):
                    for tc in delta["tool_calls"]:
                        tc_deltas_by_choice[idx].append(tc)

    # Both choices must independently produce the correct tool call.
    for choice_idx in range(num_choices):
        deltas = tc_deltas_by_choice[choice_idx]
        assert len(deltas) > 0, (
            f"Choice {choice_idx}: expected tool-call deltas but got none"
        )
        # Reassemble the streamed function name and argument fragments.
        name = None
        args_buf = ""
        for tc in deltas:
            fn = tc.get("function", {})
            if fn.get("name"):
                name = fn["name"]
            if fn.get("arguments"):
                args_buf += fn["arguments"]
        assert name == "get_weather", (
            f"Choice {choice_idx}: expected 'get_weather', got {name!r}"
        )
        parsed_args = json.loads(args_buf)
        assert parsed_args == {"city": "Tokyo"}, (
            f"Choice {choice_idx}: expected {{'city': 'Tokyo'}}, got {parsed_args}"
        )
class TestCreateRemainingArgsDelta:
"""Tests for _create_remaining_args_delta helper function.

View File

@@ -548,7 +548,7 @@ class OpenAIServingChat(OpenAIServing):
# all_previous_token_ids will not be used twice in the same iteration.
if tool_choice_auto or reasoning_parser:
# These are only required in "auto" tool choice case
-all_previous_token_ids = [[]] * num_choices
+all_previous_token_ids = [[] for _ in range(num_choices)]
# For reasoning parser and tool call all enabled
added_content_delta_arr = [False] * num_choices
reasoning_end_arr = [False] * num_choices
@@ -566,7 +566,8 @@ class OpenAIServingChat(OpenAIServing):
tool_parsers: list[ToolParser | None] = [
self.tool_parser(tokenizer, request.tools)
-] * num_choices
+    for _ in range(num_choices)
+]
else:
tool_parsers = [None] * num_choices
except Exception as e: