[Bugfix] accept redacted thinking blocks in Anthropic messages (#36992)
Signed-off-by: Benjamin Bartels <benjaminba@tiglab-ubuntu.ilab.local>
Signed-off-by: bbartels <benjamin@bartels.dev>
Co-authored-by: Benjamin Bartels <benjaminba@tiglab-ubuntu.ilab.local>
This commit is contained in:
@@ -4,6 +4,9 @@
|
|||||||
|
|
||||||
Tests the image source handling and tool_result content parsing in
|
Tests the image source handling and tool_result content parsing in
|
||||||
AnthropicServingMessages._convert_anthropic_to_openai_request().
|
AnthropicServingMessages._convert_anthropic_to_openai_request().
|
||||||
|
|
||||||
|
Also covers extended-thinking edge cases such as ``redacted_thinking``
|
||||||
|
blocks echoed back by Anthropic clients.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from vllm.entrypoints.anthropic.protocol import (
|
from vllm.entrypoints.anthropic.protocol import (
|
||||||
@@ -373,3 +376,262 @@ class TestAttributionHeaderStripping:
|
|||||||
result = _convert(request)
|
result = _convert(request)
|
||||||
system_msg = result.messages[0]
|
system_msg = result.messages[0]
|
||||||
assert system_msg["content"] == "You are a helpful assistant."
|
assert system_msg["content"] == "You are a helpful assistant."
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# Thinking block conversion (Anthropic → OpenAI)
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestThinkingBlockConversion:
|
||||||
|
"""Verify that thinking blocks in assistant messages are correctly
|
||||||
|
moved to the ``reasoning`` field and stripped from ``content`` during
|
||||||
|
the Anthropic→OpenAI conversion.
|
||||||
|
|
||||||
|
This is the Anthropic-endpoint path: the client echoes back the full
|
||||||
|
assistant message (including thinking blocks emitted by vllm) in
|
||||||
|
subsequent requests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_thinking_plus_text_in_assistant_message(self):
|
||||||
|
"""thinking + text → reasoning field + plain-string content."""
|
||||||
|
request = _make_request(
|
||||||
|
[
|
||||||
|
{"role": "user", "content": "Write me some code."},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "thinking",
|
||||||
|
"thinking": "I should write a simple example.",
|
||||||
|
"signature": "sig_abc123",
|
||||||
|
},
|
||||||
|
{"type": "text", "text": "Sure! Here is the code."},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{"role": "user", "content": "Can you fix the bug?"},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
result = _convert(request)
|
||||||
|
|
||||||
|
# Find the assistant message in the converted output.
|
||||||
|
asst_msgs = [m for m in result.messages if m.get("role") == "assistant"]
|
||||||
|
assert len(asst_msgs) == 1
|
||||||
|
asst = asst_msgs[0]
|
||||||
|
|
||||||
|
# Thinking content must be in reasoning, NOT in content.
|
||||||
|
assert asst.get("reasoning") == "I should write a simple example."
|
||||||
|
assert asst.get("content") == "Sure! Here is the code."
|
||||||
|
|
||||||
|
def test_thinking_only_in_assistant_message(self):
|
||||||
|
"""Assistant message with only a thinking block (no visible text).
|
||||||
|
|
||||||
|
This can happen when the model emits reasoning but no final answer
|
||||||
|
yet (e.g. a mid-turn reasoning step). Content should be None.
|
||||||
|
"""
|
||||||
|
request = _make_request(
|
||||||
|
[
|
||||||
|
{"role": "user", "content": "Hello"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "thinking",
|
||||||
|
"thinking": "Just thinking...",
|
||||||
|
"signature": "sig_xyz",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{"role": "user", "content": "Go on."},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
result = _convert(request)
|
||||||
|
|
||||||
|
asst_msgs = [m for m in result.messages if m.get("role") == "assistant"]
|
||||||
|
assert len(asst_msgs) == 1
|
||||||
|
asst = asst_msgs[0]
|
||||||
|
|
||||||
|
assert asst.get("reasoning") == "Just thinking..."
|
||||||
|
# No visible text → content should be absent or None.
|
||||||
|
assert asst.get("content") is None
|
||||||
|
|
||||||
|
def test_thinking_plus_tool_use_in_assistant_message(self):
|
||||||
|
"""thinking + tool_use: reasoning field set, tool_calls populated."""
|
||||||
|
request = _make_request(
|
||||||
|
[
|
||||||
|
{"role": "user", "content": "What is 2+2?"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "thinking",
|
||||||
|
"thinking": "I need to call the calculator.",
|
||||||
|
"signature": "sig_tool",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "tool_use",
|
||||||
|
"id": "call_001",
|
||||||
|
"name": "calculator",
|
||||||
|
"input": {"expression": "2+2"},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "tool_result",
|
||||||
|
"tool_use_id": "call_001",
|
||||||
|
"content": "4",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
result = _convert(request)
|
||||||
|
|
||||||
|
asst_msgs = [m for m in result.messages if m.get("role") == "assistant"]
|
||||||
|
assert len(asst_msgs) == 1
|
||||||
|
asst = asst_msgs[0]
|
||||||
|
|
||||||
|
assert asst.get("reasoning") == "I need to call the calculator."
|
||||||
|
tool_calls = list(asst.get("tool_calls", []))
|
||||||
|
assert len(tool_calls) == 1
|
||||||
|
assert tool_calls[0]["function"]["name"] == "calculator"
|
||||||
|
# No text content alongside reasoning + tool_use.
|
||||||
|
assert asst.get("content") is None
|
||||||
|
|
||||||
|
def test_multiple_thinking_blocks_concatenated(self):
|
||||||
|
"""Multiple thinking blocks should be joined in order."""
|
||||||
|
request = _make_request(
|
||||||
|
[
|
||||||
|
{"role": "user", "content": "Think hard."},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "thinking",
|
||||||
|
"thinking": "First thought. ",
|
||||||
|
"signature": "s1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "thinking",
|
||||||
|
"thinking": "Second thought.",
|
||||||
|
"signature": "s2",
|
||||||
|
},
|
||||||
|
{"type": "text", "text": "Done."},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
result = _convert(request)
|
||||||
|
|
||||||
|
asst_msgs = [m for m in result.messages if m.get("role") == "assistant"]
|
||||||
|
assert len(asst_msgs) == 1
|
||||||
|
asst = asst_msgs[0]
|
||||||
|
|
||||||
|
assert asst.get("reasoning") == "First thought. Second thought."
|
||||||
|
assert asst.get("content") == "Done."
|
||||||
|
|
||||||
|
def test_no_thinking_blocks_unchanged(self):
|
||||||
|
"""Messages without thinking blocks must not be modified."""
|
||||||
|
request = _make_request(
|
||||||
|
[
|
||||||
|
{"role": "user", "content": "Hi"},
|
||||||
|
{"role": "assistant", "content": "Hello!"},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
result = _convert(request)
|
||||||
|
|
||||||
|
asst_msgs = [m for m in result.messages if m.get("role") == "assistant"]
|
||||||
|
assert len(asst_msgs) == 1
|
||||||
|
asst = asst_msgs[0]
|
||||||
|
|
||||||
|
assert asst.get("content") == "Hello!"
|
||||||
|
assert "reasoning" not in asst
|
||||||
|
|
||||||
|
def test_multi_turn_with_thinking_blocks(self):
|
||||||
|
"""Full multi-turn conversation: previous assistant messages that
|
||||||
|
include thinking blocks must all be converted without a 400 error.
|
||||||
|
|
||||||
|
This is the primary regression scenario from the bug report:
|
||||||
|
upgrading vllm from v0.15.1 → v0.17.0 introduced thinking-block
|
||||||
|
support in responses, but echoing those responses back in subsequent
|
||||||
|
requests caused a Pydantic validation failure.
|
||||||
|
"""
|
||||||
|
request = _make_request(
|
||||||
|
[
|
||||||
|
{"role": "user", "content": "Turn 1 question"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "thinking",
|
||||||
|
"thinking": "Reasoning for turn 1.",
|
||||||
|
"signature": "s_t1",
|
||||||
|
},
|
||||||
|
{"type": "text", "text": "Answer for turn 1."},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{"role": "user", "content": "Turn 2 question"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "thinking",
|
||||||
|
"thinking": "Reasoning for turn 2.",
|
||||||
|
"signature": "s_t2",
|
||||||
|
},
|
||||||
|
{"type": "text", "text": "Answer for turn 2."},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{"role": "user", "content": "Turn 3 question"},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
# Must not raise a ValidationError / 400.
|
||||||
|
result = _convert(request)
|
||||||
|
|
||||||
|
asst_msgs = [m for m in result.messages if m.get("role") == "assistant"]
|
||||||
|
assert len(asst_msgs) == 2
|
||||||
|
|
||||||
|
assert asst_msgs[0].get("reasoning") == "Reasoning for turn 1."
|
||||||
|
assert asst_msgs[0].get("content") == "Answer for turn 1."
|
||||||
|
assert asst_msgs[1].get("reasoning") == "Reasoning for turn 2."
|
||||||
|
assert asst_msgs[1].get("content") == "Answer for turn 2."
|
||||||
|
|
||||||
|
def test_redacted_thinking_block_is_accepted(self):
|
||||||
|
"""Anthropic clients may echo back redacted thinking blocks.
|
||||||
|
|
||||||
|
vLLM should accept these blocks (to avoid 400 validation errors)
|
||||||
|
and ignore them when constructing the OpenAI-format prompt.
|
||||||
|
"""
|
||||||
|
request = _make_request(
|
||||||
|
[
|
||||||
|
{"role": "user", "content": "Hello"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "thinking",
|
||||||
|
"thinking": "Thinking...",
|
||||||
|
"signature": "sig_think",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "redacted_thinking",
|
||||||
|
"data": "BASE64_OR_OTHER_OPAQUE_DATA",
|
||||||
|
},
|
||||||
|
{"type": "text", "text": "Hi!"},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{"role": "user", "content": "Continue"},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
result = _convert(request)
|
||||||
|
|
||||||
|
asst_msgs = [m for m in result.messages if m.get("role") == "assistant"]
|
||||||
|
assert len(asst_msgs) == 1
|
||||||
|
asst = asst_msgs[0]
|
||||||
|
|
||||||
|
# Redacted thinking is ignored, normal thinking still becomes reasoning.
|
||||||
|
assert asst.get("reasoning") == "Thinking..."
|
||||||
|
assert asst.get("content") == "Hi!"
|
||||||
|
|||||||
@@ -34,7 +34,14 @@ class AnthropicUsage(BaseModel):
|
|||||||
class AnthropicContentBlock(BaseModel):
|
class AnthropicContentBlock(BaseModel):
|
||||||
"""Content block in message"""
|
"""Content block in message"""
|
||||||
|
|
||||||
type: Literal["text", "image", "tool_use", "tool_result", "thinking"]
|
type: Literal[
|
||||||
|
"text",
|
||||||
|
"image",
|
||||||
|
"tool_use",
|
||||||
|
"tool_result",
|
||||||
|
"thinking",
|
||||||
|
"redacted_thinking",
|
||||||
|
]
|
||||||
text: str | None = None
|
text: str | None = None
|
||||||
# For image content
|
# For image content
|
||||||
source: dict[str, Any] | None = None
|
source: dict[str, Any] | None = None
|
||||||
@@ -48,6 +55,8 @@ class AnthropicContentBlock(BaseModel):
|
|||||||
# For thinking content
|
# For thinking content
|
||||||
thinking: str | None = None
|
thinking: str | None = None
|
||||||
signature: str | None = None
|
signature: str | None = None
|
||||||
|
# For redacted thinking content (safety-filtered by the API)
|
||||||
|
data: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class AnthropicMessage(BaseModel):
|
class AnthropicMessage(BaseModel):
|
||||||
|
|||||||
@@ -224,6 +224,12 @@ class AnthropicServingMessages(OpenAIServingChat):
|
|||||||
content_parts.append({"type": "image_url", "image_url": {"url": image_url}})
|
content_parts.append({"type": "image_url", "image_url": {"url": image_url}})
|
||||||
elif block.type == "thinking" and block.thinking is not None:
|
elif block.type == "thinking" and block.thinking is not None:
|
||||||
reasoning_parts.append(block.thinking)
|
reasoning_parts.append(block.thinking)
|
||||||
|
elif block.type == "redacted_thinking":
|
||||||
|
# Redacted thinking blocks contain safety-filtered reasoning.
|
||||||
|
# We skip them as the content is opaque (base64 'data' field),
|
||||||
|
# but accepting the block prevents a validation error when the
|
||||||
|
# client echoes back the full assistant message.
|
||||||
|
pass
|
||||||
elif block.type == "tool_use":
|
elif block.type == "tool_use":
|
||||||
cls._convert_tool_use_block(block, tool_calls)
|
cls._convert_tool_use_block(block, tool_calls)
|
||||||
elif block.type == "tool_result":
|
elif block.type == "tool_result":
|
||||||
|
|||||||
Reference in New Issue
Block a user