[MODEL] Fix handling of multiple channels for gpt-oss with speculative decoding (#26291)

Signed-off-by: Aleksandr Samarin <astrlrd@nebius.com> Signed-off-by: southfreebird <yvorott@gmail.com> Co-authored-by: southfreebird <yvorott@gmail.com>
2026-01-14 21:20:52 +03:00
parent 3a612322eb
commit d084e9fca7
4 changed files with 672 additions and 383 deletions
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -35,6 +35,7 @@ from .utils import (
 )

 GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
+GPT_OSS_SPECULATOR_NAME = "RedHatAI/gpt-oss-20b-speculator.eagle3"


@pytest.fixture(scope="module")
@@ -66,7 +67,8 @@ def exclude_tools_when_tool_choice_none(request) -> bool:

@pytest.fixture(scope="module")
 def default_server_args(
-    with_tool_parser: bool, exclude_tools_when_tool_choice_none: bool
+    with_tool_parser: bool,
+    exclude_tools_when_tool_choice_none: bool,
 ):
    args = [
        # use half precision for speed and memory savings in CI environment
@@ -76,7 +78,7 @@ def default_server_args(
        "--reasoning-parser",
        "openai_gptoss",
        "--gpu-memory-utilization",
-        "0.8",
+        "0.85",
    ]
    if with_tool_parser:
        args.extend(
@@ -91,327 +93,385 @@ def default_server_args(
    return args


-@pytest.fixture(scope="module")
+@pytest.fixture(scope="class")
 def gptoss_server(default_server_args: list[str]):
    server_args = default_server_args + ["--attention-backend=TRITON_ATTN"]
    with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, server_args) as remote_server:
        yield remote_server


+@pytest.fixture(scope="class")
+def gptoss_speculative_server(default_server_args: list[str]):
+    server_args = default_server_args + [
+        "--speculative-config",
+        f'{{"model": "{GPT_OSS_SPECULATOR_NAME}", '
+        f'"method": "eagle3", "num_speculative_tokens": 3}}',
+        "--attention-backend=TRITON_ATTN",
+    ]
+    with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, server_args) as remote_server:
+        yield remote_server
+
+
@pytest_asyncio.fixture
 async def gptoss_client(gptoss_server):
    async with gptoss_server.get_async_client() as async_client:
        yield async_client


-@pytest.mark.asyncio
-async def test_gpt_oss_chat_tool_call_streaming(
-    gptoss_client: OpenAI, with_tool_parser: bool
-):
-    tools = [
-        {
-            "type": "function",
-            "function": {
-                "name": "get_current_weather",
-                "description": "Get the current weather in a given location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "city": {"type": "string"},
-                        "state": {"type": "string"},
-                        "unit": {
-                            "type": "string",
-                            "enum": ["celsius", "fahrenheit"],
+@pytest_asyncio.fixture
+async def gptoss_speculative_client(gptoss_speculative_server):
+    async with gptoss_speculative_server.get_async_client() as async_client:
+        yield async_client
+
+
+class TestGPTOSSChat:
+    @pytest.mark.asyncio
+    async def test_gpt_oss_chat_tool_call_streaming(
+        self, gptoss_client: OpenAI, with_tool_parser: bool
+    ):
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_current_weather",
+                    "description": "Get the current weather in a given location",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {"type": "string"},
+                            "state": {"type": "string"},
+                            "unit": {
+                                "type": "string",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
                        },
+                        "required": ["city", "state", "unit"],
                    },
-                    "required": ["city", "state", "unit"],
                },
-            },
-        }
-    ]
+            }
+        ]

-    messages = [
-        {"role": "user", "content": "What is the weather in Dallas, TX?"},
-    ]
+        messages = [
+            {"role": "user", "content": "What is the weather in Dallas, TX?"},
+        ]

-    stream = await gptoss_client.chat.completions.create(
-        model=GPT_OSS_MODEL_NAME,
-        messages=messages,
-        tools=tools if with_tool_parser else None,
-        stream=True,
-    )
-
-    name = None
-    args_buf = ""
-    content_buf = ""
-    async for chunk in stream:
-        delta = chunk.choices[0].delta
-        if delta.tool_calls:
-            tc = delta.tool_calls[0]
-            if tc.function and tc.function.name:
-                name = tc.function.name
-            if tc.function and tc.function.arguments:
-                args_buf += tc.function.arguments
-        if getattr(delta, "content", None):
-            content_buf += delta.content
-    if with_tool_parser:
-        assert name is not None
-        assert len(args_buf) > 0
-    else:
-        assert name is None
-        assert len(args_buf) == 0
-        assert len(content_buf) > 0
-
-
-@pytest.mark.asyncio
-async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI, with_tool_parser: bool):
-    if not with_tool_parser:
-        pytest.skip("skip non-tool for multi-turn tests")
-    tools = [
-        {
-            "type": "function",
-            "function": {
-                "name": "get_current_weather",
-                "description": "Get the current weather in a given location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "city": {"type": "string"},
-                        "state": {"type": "string"},
-                        "unit": {
-                            "type": "string",
-                            "enum": ["celsius", "fahrenheit"],
-                        },
-                    },
-                    "required": ["city", "state", "unit"],
-                },
-            },
-        }
-    ]
-
-    messages = [
-        {"role": "system", "content": "you are a helpful assistant"},
-        {"role": "user", "content": "What is the weather in Dallas, TX with celsius?"},
-    ]
-
-    first = await gptoss_client.chat.completions.create(
-        model=GPT_OSS_MODEL_NAME,
-        messages=messages,
-        tools=tools,
-        temperature=0.0,
-    )
-    first_msg = first.choices[0].message
-    assert first_msg.tool_calls is not None and len(first_msg.tool_calls) > 0
-    tc = first_msg.tool_calls[0]
-    assert tc.function is not None and tc.function.name == "get_current_weather"
-    args1 = tc.function.arguments
-    assert args1 is not None and len(args1) > 0
-    assert not first_msg.content
-
-    messages.append({"role": "assistant", "content": args1})
-    messages.append(
-        {"role": "user", "content": "Now convert to celsius and return JSON only"}
-    )
-
-    second = await gptoss_client.chat.completions.create(
-        model=GPT_OSS_MODEL_NAME,
-        messages=messages,
-        tools=tools,
-        temperature=0.0,
-    )
-    second_msg = second.choices[0].message
-    assert (second_msg.content is not None and len(second_msg.content) > 0) or (
-        second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0
-    )
-
-
-@pytest.mark.asyncio
-async def test_gpt_oss_tool_message_array_content(
-    gptoss_client: OpenAI, with_tool_parser: bool
-):
-    """Test that tool messages support both string and array content formats."""
-    if not with_tool_parser:
-        pytest.skip("skip non-tool for array content tests")
-
-    tools = [
-        {
-            "type": "function",
-            "function": {
-                "name": "get_weather",
-                "description": "Get the current weather in a given location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "city": {"type": "string"},
-                        "state": {"type": "string"},
-                    },
-                    "required": ["city", "state"],
-                },
-            },
-        }
-    ]
-
-    # Test 1: Tool message with string content
-    messages_string = [
-        {"role": "user", "content": "What's the weather in Paris?"},
-        {
-            "role": "assistant",
-            "tool_calls": [
-                {
-                    "id": "call_123",
-                    "type": "function",
-                    "function": {
-                        "name": "get_weather",
-                        "arguments": '{"city": "Paris", "state": "TX"}',
-                    },
-                }
-            ],
-        },
-        {"role": "tool", "content": "The weather in Paris, TX is sunny, 22°C"},
-    ]
-
-    response_string = await gptoss_client.chat.completions.create(
-        model=GPT_OSS_MODEL_NAME,
-        messages=messages_string,
-        tools=tools,
-        temperature=0.0,
-    )
-
-    assert response_string is not None
-    assert response_string.choices[0].message is not None
-
-    # Test 2: Tool message with array content
-    messages_array = [
-        {"role": "user", "content": "What's the weather in Dallas?"},
-        {
-            "role": "assistant",
-            "tool_calls": [
-                {
-                    "id": "call_456",
-                    "type": "function",
-                    "function": {
-                        "name": "get_weather",
-                        "arguments": '{"city": "Dallas", "state": "TX"}',
-                    },
-                }
-            ],
-        },
-        {
-            "role": "tool",
-            "content": [
-                {"type": "text", "text": "f2e897a7-2705-4337-8193-2a8f57b81618"}
-            ],
-        },
-    ]
-
-    response_array = await gptoss_client.chat.completions.create(
-        model=GPT_OSS_MODEL_NAME,
-        messages=messages_array,
-        tools=tools,
-        temperature=0.0,
-    )
-
-    assert response_array is not None
-    assert response_array.choices[0].message is not None
-
-    # Test 3: Tool message with multiple array content items
-    messages_multi_array = [
-        {"role": "user", "content": "Search for information"},
-        {
-            "role": "assistant",
-            "tool_calls": [
-                {
-                    "id": "call_789",
-                    "type": "function",
-                    "function": {
-                        "name": "get_weather",
-                        "arguments": '{"city": "Austin", "state": "TX"}',
-                    },
-                }
-            ],
-        },
-        {
-            "role": "tool",
-            "content": [
-                {"type": "text", "text": "Weather data: "},
-                {"type": "text", "text": "Austin, TX - Partly cloudy, 25°C"},
-                {"type": "text", "text": " with 60% humidity"},
-            ],
-        },
-    ]
-
-    response_multi_array = await gptoss_client.chat.completions.create(
-        model=GPT_OSS_MODEL_NAME,
-        messages=messages_multi_array,
-        tools=tools,
-        temperature=0.0,
-    )
-
-    assert response_multi_array is not None
-    assert response_multi_array.choices[0].message is not None
-
-
-@pytest.mark.asyncio
-async def test_gpt_oss_tool_choice_none(
-    gptoss_client: OpenAI,
-    with_tool_parser: bool,
-    exclude_tools_when_tool_choice_none: bool,
-):
-    if not (with_tool_parser and exclude_tools_when_tool_choice_none):
-        pytest.skip(
-            "skip tool_choice tests when non-tool or "
-            "--exclude-tools-when-tool-choice-none not set"
+        stream = await gptoss_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages,
+            tools=tools if with_tool_parser else None,
+            stream=True,
        )

-    tools = [
-        {
-            "type": "function",
-            "function": {
-                "name": "get_current_weather",
-                "description": "Get the current weather in a given location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "city": {"type": "string"},
-                        "state": {"type": "string"},
-                        "unit": {
-                            "type": "string",
-                            "enum": ["celsius", "fahrenheit"],
+        name = None
+        args_buf = ""
+        content_buf = ""
+        async for chunk in stream:
+            delta = chunk.choices[0].delta
+            if delta.tool_calls:
+                tc = delta.tool_calls[0]
+                if tc.function and tc.function.name:
+                    name = tc.function.name
+                if tc.function and tc.function.arguments:
+                    args_buf += tc.function.arguments
+            if getattr(delta, "content", None):
+                content_buf += delta.content
+        if with_tool_parser:
+            assert name is not None
+            assert len(args_buf) > 0
+        else:
+            assert name is None
+            assert len(args_buf) == 0
+            assert len(content_buf) > 0
+
+    @pytest.mark.asyncio
+    async def test_gpt_oss_multi_turn_chat(
+        self, gptoss_client: OpenAI, with_tool_parser: bool
+    ):
+        if not with_tool_parser:
+            pytest.skip("skip non-tool for multi-turn tests")
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_current_weather",
+                    "description": "Get the current weather in a given location",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {"type": "string"},
+                            "state": {"type": "string"},
+                            "unit": {
+                                "type": "string",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
                        },
+                        "required": ["city", "state", "unit"],
                    },
-                    "required": ["city", "state", "unit"],
                },
+            }
+        ]
+
+        messages = [
+            {"role": "system", "content": "you are a helpful assistant"},
+            {
+                "role": "user",
+                "content": "What is the weather in Dallas, TX with celsius?",
            },
-        }
-    ]
+        ]

-    messages = [
-        {
-            "role": "user",
-            "content": "What's the temperature(in degrees Celsius) in Dallas?",
-        },
-    ]
+        first = await gptoss_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages,
+            tools=tools,
+            temperature=0.0,
+        )
+        first_msg = first.choices[0].message
+        assert first_msg.tool_calls is not None and len(first_msg.tool_calls) > 0
+        tc = first_msg.tool_calls[0]
+        assert tc.function is not None and tc.function.name == "get_current_weather"
+        args1 = tc.function.arguments
+        assert args1 is not None and len(args1) > 0
+        assert not first_msg.content

-    tool_choice_auto = await gptoss_client.chat.completions.create(
-        model=GPT_OSS_MODEL_NAME,
-        messages=messages,
-        tools=tools,
-        tool_choice="auto",
-        temperature=0.0,
-    )
-    msg = tool_choice_auto.choices[0].message
-    assert len(msg.tool_calls) == 1
+        messages.append({"role": "assistant", "content": args1})
+        messages.append(
+            {"role": "user", "content": "Now convert to celsius and return JSON only"}
+        )

-    tool_choice_none = await gptoss_client.chat.completions.create(
-        model=GPT_OSS_MODEL_NAME,
-        messages=messages,
-        tools=tools,
-        tool_choice="none",
-        temperature=0.0,
-    )
+        second = await gptoss_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages,
+            tools=tools,
+            temperature=0.0,
+        )
+        second_msg = second.choices[0].message
+        assert (second_msg.content is not None and len(second_msg.content) > 0) or (
+            second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0
+        )

-    msg = tool_choice_none.choices[0].message
-    assert len(msg.tool_calls) == 0
+    @pytest.mark.asyncio
+    async def test_gpt_oss_tool_message_array_content(
+        self, gptoss_client: OpenAI, with_tool_parser: bool
+    ):
+        """Test that tool messages support both string and array content formats."""
+        if not with_tool_parser:
+            pytest.skip("skip non-tool for array content tests")
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "description": "Get the current weather in a given location",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {"type": "string"},
+                            "state": {"type": "string"},
+                        },
+                        "required": ["city", "state"],
+                    },
+                },
+            }
+        ]
+
+        # Test 1: Tool message with string content
+        messages_string = [
+            {"role": "user", "content": "What's the weather in Paris?"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "id": "call_123",
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"city": "Paris", "state": "TX"}',
+                        },
+                    }
+                ],
+            },
+            {"role": "tool", "content": "The weather in Paris, TX is sunny, 22°C"},
+        ]
+
+        response_string = await gptoss_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages_string,
+            tools=tools,
+            temperature=0.0,
+        )
+
+        assert response_string is not None
+        assert response_string.choices[0].message is not None
+
+        # Test 2: Tool message with array content
+        messages_array = [
+            {"role": "user", "content": "What's the weather in Dallas?"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "id": "call_456",
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"city": "Dallas", "state": "TX"}',
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "content": [
+                    {"type": "text", "text": "f2e897a7-2705-4337-8193-2a8f57b81618"}
+                ],
+            },
+        ]
+
+        response_array = await gptoss_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages_array,
+            tools=tools,
+            temperature=0.0,
+        )
+
+        assert response_array is not None
+        assert response_array.choices[0].message is not None
+
+        # Test 3: Tool message with multiple array content items
+        messages_multi_array = [
+            {"role": "user", "content": "Search for information"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "id": "call_789",
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"city": "Austin", "state": "TX"}',
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "content": [
+                    {"type": "text", "text": "Weather data: "},
+                    {"type": "text", "text": "Austin, TX - Partly cloudy, 25°C"},
+                    {"type": "text", "text": " with 60% humidity"},
+                ],
+            },
+        ]
+
+        response_multi_array = await gptoss_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages_multi_array,
+            tools=tools,
+            temperature=0.0,
+        )
+
+        assert response_multi_array is not None
+        assert response_multi_array.choices[0].message is not None
+
+    @pytest.mark.asyncio
+    async def test_gpt_oss_tool_choice_none(
+        self,
+        gptoss_client: OpenAI,
+        with_tool_parser: bool,
+        exclude_tools_when_tool_choice_none: bool,
+    ):
+        if not (with_tool_parser and exclude_tools_when_tool_choice_none):
+            pytest.skip(
+                "skip tool_choice tests when non-tool or "
+                "--exclude-tools-when-tool-choice-none not set"
+            )
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_current_weather",
+                    "description": "Get the current weather in a given location",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {"type": "string"},
+                            "state": {"type": "string"},
+                            "unit": {
+                                "type": "string",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["city", "state", "unit"],
+                    },
+                },
+            }
+        ]
+
+        messages = [
+            {
+                "role": "user",
+                "content": "What's the temperature(in degrees Celsius) in Dallas?",
+            },
+        ]
+
+        tool_choice_auto = await gptoss_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages,
+            tools=tools,
+            tool_choice="auto",
+            temperature=0.0,
+        )
+        msg = tool_choice_auto.choices[0].message
+        assert len(msg.tool_calls) == 1
+
+        tool_choice_none = await gptoss_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages,
+            tools=tools,
+            tool_choice="none",
+            temperature=0.0,
+        )
+
+        msg = tool_choice_none.choices[0].message
+        assert len(msg.tool_calls) == 0
+
+
+class TestGPTOSSSpeculativeChat:
+    @pytest.mark.asyncio
+    async def test_gpt_oss_speculative_reasoning_leakage(
+        self,
+        gptoss_speculative_client: OpenAI,
+        with_tool_parser: bool,
+    ):
+        if not with_tool_parser:
+            pytest.skip("skip non-tool for array content tests")
+
+        messages = [
+            {"role": "user", "content": "Calculate 2+2. Return the answer 4 only."},
+        ]
+
+        stream = await gptoss_speculative_client.chat.completions.create(
+            model=GPT_OSS_MODEL_NAME,
+            messages=messages,
+            stream=True,
+            temperature=0.0,
+        )
+
+        content = ""
+        reasoning_content = ""
+        async for chunk in stream:
+            delta = chunk.choices[0].delta
+            if delta.content:
+                content += delta.content
+
+            chunk_reasoning = getattr(delta, "reasoning", None)
+            if chunk_reasoning:
+                reasoning_content += delta.reasoning
+
+        assert len(reasoning_content) > 0, "No reasoning was generated."
+        assert content.strip() == "4"


 MODEL_NAME = "openai-community/gpt2"
--- a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
+++ b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
@@ -10,6 +10,7 @@ from unittest.mock import patch
 import pytest

 from vllm.entrypoints.openai.chat_completion.stream_harmony import (
+    TokenState,
    extract_harmony_streaming_delta,
 )

@@ -42,12 +43,14 @@ class TestExtractHarmonyStreamingDelta:
    def test_final_channel_returns_content_delta(self, delta_text, expected_content):
        """Test that final channel returns a DeltaMessage with content."""
        parser = MockStreamableParser()
+
+        # Updated to use TokenState list
+        token_states = [TokenState(channel="final", recipient=None, text=delta_text)]
+
        delta_message, tools_streamed = extract_harmony_streaming_delta(
            harmony_parser=parser,
-            cur_channel="final",
-            cur_recipient=None,
+            token_states=token_states,
            prev_recipient=None,
-            delta_text=delta_text,
            include_reasoning=False,
        )

@@ -65,18 +68,19 @@ class TestExtractHarmonyStreamingDelta:
    def test_analysis_channel_reasoning(self, include_reasoning, expected_has_message):
        """Test analysis channel respects include_reasoning flag."""
        parser = MockStreamableParser()
+        text = "Let me think..."
+        token_states = [TokenState(channel="analysis", recipient=None, text=text)]
+
        delta_message, tools_streamed = extract_harmony_streaming_delta(
            harmony_parser=parser,
-            cur_channel="analysis",
-            cur_recipient=None,
+            token_states=token_states,
            prev_recipient=None,
-            delta_text="Let me think...",
            include_reasoning=include_reasoning,
        )

        if expected_has_message:
            assert delta_message is not None
-            assert delta_message.reasoning == "Let me think..."
+            assert delta_message.reasoning == text
        else:
            assert delta_message is None
        assert tools_streamed is False
@@ -88,12 +92,14 @@ class TestExtractHarmonyStreamingDelta:
        mock_make_tool_call_id.return_value = "call_test123"
        parser = MockStreamableParser()

+        token_states = [
+            TokenState(channel=channel, recipient="functions.get_weather", text="")
+        ]
+
        delta_message, tools_streamed = extract_harmony_streaming_delta(
            harmony_parser=parser,
-            cur_channel=channel,
-            cur_recipient="functions.get_weather",
+            token_states=token_states,
            prev_recipient=None,
-            delta_text="",
            include_reasoning=False,
        )

@@ -111,20 +117,25 @@ class TestExtractHarmonyStreamingDelta:
    def test_tool_call_argument_streaming(self, channel):
        """Test streaming tool call arguments (same recipient)."""
        parser = MockStreamableParser()
+        args_text = '{"location": "Paris"}'
+
+        token_states = [
+            TokenState(
+                channel=channel, recipient="functions.get_weather", text=args_text
+            )
+        ]

        delta_message, tools_streamed = extract_harmony_streaming_delta(
            harmony_parser=parser,
-            cur_channel=channel,
-            cur_recipient="functions.get_weather",
+            token_states=token_states,
            prev_recipient="functions.get_weather",
-            delta_text='{"location": "Paris"}',
            include_reasoning=False,
        )

        assert delta_message is not None
        tool_call = delta_message.tool_calls[0]
        assert tool_call.id is None
-        assert tool_call.function.arguments == '{"location": "Paris"}'
+        assert tool_call.function.arguments == args_text
        assert tool_call.index == 0
        assert tools_streamed is True

@@ -133,12 +144,14 @@ class TestExtractHarmonyStreamingDelta:
        """Test empty delta_text with same recipient returns None."""
        parser = MockStreamableParser()

+        token_states = [
+            TokenState(channel=channel, recipient="functions.get_weather", text="")
+        ]
+
        delta_message, tools_streamed = extract_harmony_streaming_delta(
            harmony_parser=parser,
-            cur_channel=channel,
-            cur_recipient="functions.get_weather",
+            token_states=token_states,
            prev_recipient="functions.get_weather",
-            delta_text="",
            include_reasoning=False,
        )

@@ -154,12 +167,14 @@ class TestExtractHarmonyStreamingDelta:
        ]
        parser = MockStreamableParser(messages=messages)

+        token_states = [
+            TokenState(channel="commentary", recipient="functions.tool2", text="args")
+        ]
+
        delta_message, _ = extract_harmony_streaming_delta(
            harmony_parser=parser,
-            cur_channel="commentary",
-            cur_recipient="functions.tool2",
+            token_states=token_states,
            prev_recipient="functions.tool2",
-            delta_text="args",
            include_reasoning=False,
        )

@@ -173,15 +188,18 @@ class TestExtractHarmonyStreamingDelta:
        ],
    )
    def test_returns_tool_call_preambles(self, channel, recipient):
-        """Test that invalid channel/recipient combinations return None."""
+        """Test that invalid tool recipient on commentary is treated as content."""
        parser = MockStreamableParser()
        delta_text = "some text"
+
+        token_states = [
+            TokenState(channel=channel, recipient=recipient, text=delta_text)
+        ]
+
        delta_message, tools_streamed = extract_harmony_streaming_delta(
            harmony_parser=parser,
-            cur_channel=channel,
-            cur_recipient=recipient,
+            token_states=token_states,
            prev_recipient=None,
-            delta_text=delta_text,
            include_reasoning=True,
        )

@@ -199,14 +217,140 @@ class TestExtractHarmonyStreamingDelta:
        """Test that invalid channel/recipient combinations return None."""
        parser = MockStreamableParser()

+        token_states = [
+            TokenState(channel=channel, recipient=recipient, text="some text")
+        ]
+
        delta_message, tools_streamed = extract_harmony_streaming_delta(
            harmony_parser=parser,
-            cur_channel=channel,
-            cur_recipient=recipient,
+            token_states=token_states,
            prev_recipient=None,
-            delta_text="some text",
            include_reasoning=True,
        )

        assert delta_message is None
        assert tools_streamed is False
+
+    def test_consecutive_token_grouping(self):
+        """
+        Test that consecutive tokens with the same channel/recipient
+        are merged into a single processing group.
+        """
+        parser = MockStreamableParser()
+        token_states = [
+            TokenState("final", None, "H"),
+            TokenState("final", None, "el"),
+            TokenState("final", None, "lo"),
+            TokenState("final", None, ","),
+            TokenState("final", None, " World"),
+        ]
+
+        delta_message, _ = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            token_states=token_states,
+            prev_recipient=None,
+            include_reasoning=False,
+        )
+
+        assert delta_message is not None
+        assert delta_message.content == "Hello, World"
+
+    @patch("vllm.entrypoints.openai.chat_completion.stream_harmony.make_tool_call_id")
+    def test_complex_batch_permutation(self, mock_make_id):
+        """
+        Test a complex permutation: Reasoning -> Tool Call -> Content.
+        This verifies that multiple distinct actions in one batch
+        are all captured in the single DeltaMessage.
+        """
+        mock_make_id.return_value = "call_batch_test"
+        parser = MockStreamableParser()
+
+        token_states = [
+            # 1. Reasoning
+            TokenState("analysis", None, "Reasoning about query..."),
+            # 2. Tool Calling
+            TokenState("commentary", "functions.search", '{"query":'),
+            TokenState("commentary", "functions.search", ' "vllm"}'),
+            # 3. Final Content
+            TokenState("final", None, "."),
+        ]
+
+        delta_message, tools_streamed = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            token_states=token_states,
+            prev_recipient=None,
+            include_reasoning=True,
+        )
+
+        assert delta_message is not None
+
+        assert delta_message.reasoning == "Reasoning about query..."
+
+        # We expect 2 objects for 1 logical tool call:
+        # 1. The definition (id, name, type)
+        # 2. The arguments payload
+        assert len(delta_message.tool_calls) == 2
+
+        header = delta_message.tool_calls[0]
+        payload = delta_message.tool_calls[1]
+
+        assert header.function.name == "search"
+        assert header.id == "call_batch_test"
+        assert header.index == 0
+
+        assert payload.index == 0
+        assert payload.function.arguments == '{"query": "vllm"}'
+
+        assert delta_message.content == "."
+        assert tools_streamed is True
+
+    @patch("vllm.entrypoints.openai.chat_completion.stream_harmony.make_tool_call_id")
+    def test_tool_call_index_consistency_with_ongoing_call(self, mock_make_id):
+        """
+        Test that an ongoing tool call continuation and subsequent new calls
+        maintain correct indexing when interleaved with content.
+        """
+        mock_make_id.side_effect = ["id_b", "id_c"]
+
+        messages = [
+            MockMessage(channel="commentary", recipient="functions.previous_tool")
+        ]
+        parser = MockStreamableParser(messages=messages)
+
+        token_states = [
+            TokenState("commentary", "functions.tool_a", '{"key_a": "val_a"}'),
+            TokenState("final", None, "Thinking..."),
+            TokenState("commentary", "functions.tool_b", '{"key_b": "val_b"}'),
+            TokenState("final", None, " Thinking again..."),
+            TokenState("commentary", "functions.tool_c", '{"key_c": "val_c"}'),
+        ]
+
+        delta_message, _ = extract_harmony_streaming_delta(
+            harmony_parser=parser,
+            token_states=token_states,
+            prev_recipient="functions.tool_a",
+            include_reasoning=False,
+        )
+
+        assert delta_message is not None
+
+        tool_a_deltas = [t for t in delta_message.tool_calls if t.index == 1]
+        assert len(tool_a_deltas) > 0
+        assert tool_a_deltas[0].id is None
+        assert tool_a_deltas[0].function.arguments == '{"key_a": "val_a"}'
+
+        tool_b_header = next(t for t in delta_message.tool_calls if t.id == "id_b")
+        assert tool_b_header.index == 2
+        tool_b_args = next(
+            t for t in delta_message.tool_calls if t.index == 2 and t.id is None
+        )
+        assert tool_b_args.function.arguments == '{"key_b": "val_b"}'
+
+        tool_c_start = next(t for t in delta_message.tool_calls if t.id == "id_c")
+        assert tool_c_start.index == 3
+        tool_c_args = next(
+            t for t in delta_message.tool_calls if t.index == 3 and t.id is None
+        )
+        assert tool_c_args.function.arguments == '{"key_c": "val_c"}'
+
+        assert delta_message.content == "Thinking... Thinking again..."