[Bugfix] Fix Anthropic API base64 image handling in Messages endpoint (#35557)
Signed-off-by: Martin Vit <martin@voipmonitor.org>
This commit is contained in:
326
tests/entrypoints/openai/test_anthropic_messages_conversion.py
Normal file
326
tests/entrypoints/openai/test_anthropic_messages_conversion.py
Normal file
@@ -0,0 +1,326 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""Unit tests for Anthropic-to-OpenAI request conversion.
|
||||||
|
|
||||||
|
Tests the image source handling and tool_result content parsing in
|
||||||
|
AnthropicServingMessages._convert_anthropic_to_openai_request().
|
||||||
|
"""
|
||||||
|
|
||||||
|
from vllm.entrypoints.anthropic.protocol import (
|
||||||
|
AnthropicMessagesRequest,
|
||||||
|
)
|
||||||
|
from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
|
||||||
|
|
||||||
|
# Module-level shorthands for the two AnthropicServingMessages helpers
# exercised by the tests in this file.
_img_url = AnthropicServingMessages._convert_image_source_to_url
_convert = AnthropicServingMessages._convert_anthropic_to_openai_request
|
||||||
|
|
||||||
|
|
||||||
|
def _make_request(
    messages: list[dict],
    **kwargs,
) -> AnthropicMessagesRequest:
    """Build an ``AnthropicMessagesRequest`` around *messages*.

    The model name (``"test-model"``) and ``max_tokens`` (128) are fixed;
    any extra keyword arguments are forwarded to the request constructor.
    """
    request = AnthropicMessagesRequest(
        messages=messages,
        max_tokens=128,
        model="test-model",
        **kwargs,
    )
    return request
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# _convert_image_source_to_url
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestConvertImageSourceToUrl:
    """Checks for the image-source-to-URL helper (aliased as ``_img_url``)."""

    def test_base64_source(self):
        """A base64 JPEG source is rendered as a ``data:`` URI."""
        url = _img_url(
            {"type": "base64", "media_type": "image/jpeg", "data": "iVBORw0KGgo="}
        )
        assert url == "data:image/jpeg;base64,iVBORw0KGgo="

    def test_base64_png(self):
        """The media type of the source is carried into the URI."""
        url = _img_url({"type": "base64", "media_type": "image/png", "data": "AAAA"})
        assert url == "data:image/png;base64,AAAA"

    def test_url_source(self):
        """A URL source is passed through unchanged."""
        url = _img_url({"type": "url", "url": "https://example.com/image.jpg"})
        assert url == "https://example.com/image.jpg"

    def test_missing_type_defaults_to_base64(self):
        """When 'type' is absent, treat as base64."""
        url = _img_url({"media_type": "image/webp", "data": "UklGR"})
        assert url == "data:image/webp;base64,UklGR"

    def test_missing_media_type_defaults_to_jpeg(self):
        """Without 'media_type' the URI falls back to image/jpeg."""
        url = _img_url({"type": "base64", "data": "abc123"})
        assert url == "data:image/jpeg;base64,abc123"

    def test_url_source_missing_url_returns_empty(self):
        """A URL source without 'url' yields an empty string."""
        url = _img_url({"type": "url"})
        assert url == ""

    def test_empty_source_returns_data_uri_shell(self):
        """An empty source yields a data URI with no payload."""
        url = _img_url({})
        assert url == "data:image/jpeg;base64,"
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# Image blocks inside user messages
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestImageContentBlocks:
    """Conversion of ``image`` content blocks inside user messages."""

    def test_base64_image_in_user_message(self):
        """Text plus a base64 image become text and ``image_url`` parts."""
        text_block = {"type": "text", "text": "Describe this image"}
        image_block = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": "iVBORw0KGgo=",
            },
        }
        request = _make_request(
            [{"role": "user", "content": [text_block, image_block]}]
        )

        converted = _convert(request)
        first_message = converted.messages[0]
        assert first_message["role"] == "user"

        content = first_message["content"]
        assert len(content) == 2
        assert content[0] == {"type": "text", "text": "Describe this image"}
        assert content[1] == {
            "type": "image_url",
            "image_url": {"url": "data:image/jpeg;base64,iVBORw0KGgo="},
        }

    def test_url_image_in_user_message(self):
        """A URL image source is forwarded as the ``image_url`` URL."""
        text_block = {"type": "text", "text": "What is this?"}
        image_block = {
            "type": "image",
            "source": {
                "type": "url",
                "url": "https://example.com/cat.png",
            },
        }
        request = _make_request(
            [{"role": "user", "content": [text_block, image_block]}]
        )

        converted = _convert(request)
        content = converted.messages[0]["content"]
        assert content[1] == {
            "type": "image_url",
            "image_url": {"url": "https://example.com/cat.png"},
        }
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# tool_result content handling
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestToolResultContent:
    """Conversion of ``tool_result`` blocks carried in user messages."""

    def _make_tool_result_request(
        self, tool_result_content
    ) -> AnthropicMessagesRequest:
        """Build a request with assistant tool_use followed by user
        tool_result."""
        assistant_turn = {
            "role": "assistant",
            "content": [
                {
                    "type": "tool_use",
                    "id": "call_001",
                    "name": "read_file",
                    "input": {"path": "/tmp/img.png"},
                }
            ],
        }
        user_turn = {
            "role": "user",
            "content": [
                {
                    "type": "tool_result",
                    "tool_use_id": "call_001",
                    "content": tool_result_content,
                }
            ],
        }
        return _make_request([assistant_turn, user_turn])

    @staticmethod
    def _tool_messages(result) -> list:
        """Return the converted messages whose role is ``"tool"``."""
        return [m for m in result.messages if m["role"] == "tool"]

    @staticmethod
    def _user_list_messages(result) -> list:
        """Return the converted user messages whose content is a list."""
        return [
            m
            for m in result.messages
            if m["role"] == "user" and isinstance(m.get("content"), list)
        ]

    def test_tool_result_string_content(self):
        """A plain string result is used verbatim as the tool content."""
        converted = _convert(self._make_tool_result_request("file contents here"))

        tool_messages = self._tool_messages(converted)
        assert len(tool_messages) == 1
        assert tool_messages[0]["content"] == "file contents here"
        assert tool_messages[0]["tool_call_id"] == "call_001"

    def test_tool_result_text_blocks(self):
        """Multiple text blocks are joined with newlines."""
        blocks = [
            {"type": "text", "text": "line 1"},
            {"type": "text", "text": "line 2"},
        ]
        converted = _convert(self._make_tool_result_request(blocks))

        tool_messages = self._tool_messages(converted)
        assert len(tool_messages) == 1
        assert tool_messages[0]["content"] == "line 1\nline 2"

    def test_tool_result_with_image(self):
        """Image in tool_result should produce a follow-up user message."""
        blocks = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": "AAAA",
                },
            }
        ]
        converted = _convert(self._make_tool_result_request(blocks))

        tool_messages = self._tool_messages(converted)
        assert len(tool_messages) == 1
        assert tool_messages[0]["content"] == ""

        # The image should be injected as a follow-up user message
        follow_ups = self._user_list_messages(converted)
        assert len(follow_ups) == 1
        image_parts = follow_ups[0]["content"]
        assert len(image_parts) == 1
        assert image_parts[0] == {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64,AAAA"},
        }

    def test_tool_result_with_text_and_image(self):
        """Mixed text+image tool_result: text in tool msg, image in user
        msg."""
        blocks = [
            {"type": "text", "text": "Here is the screenshot"},
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": "QUFB",
                },
            },
        ]
        converted = _convert(self._make_tool_result_request(blocks))

        tool_messages = self._tool_messages(converted)
        assert len(tool_messages) == 1
        assert tool_messages[0]["content"] == "Here is the screenshot"

        follow_ups = self._user_list_messages(converted)
        assert len(follow_ups) == 1
        assert follow_ups[0]["content"][0]["image_url"]["url"] == (
            "data:image/jpeg;base64,QUFB"
        )

    def test_tool_result_with_multiple_images(self):
        """All images of one tool_result share a single follow-up message."""
        blocks = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": "IMG1",
                },
            },
            {
                "type": "image",
                "source": {
                    "type": "url",
                    "url": "https://example.com/img2.jpg",
                },
            },
        ]
        converted = _convert(self._make_tool_result_request(blocks))

        follow_ups = self._user_list_messages(converted)
        assert len(follow_ups) == 1
        urls = [part["image_url"]["url"] for part in follow_ups[0]["content"]]
        assert urls == [
            "data:image/png;base64,IMG1",
            "https://example.com/img2.jpg",
        ]

    def test_tool_result_none_content(self):
        """A ``None`` result yields a tool message with empty content."""
        converted = _convert(self._make_tool_result_request(None))

        tool_messages = self._tool_messages(converted)
        assert len(tool_messages) == 1
        assert tool_messages[0]["content"] == ""

    def test_tool_result_no_follow_up_when_no_images(self):
        """Ensure no extra user message is added when there are no images."""
        blocks = [
            {"type": "text", "text": "just text"},
        ]
        converted = _convert(self._make_tool_result_request(blocks))

        follow_ups = self._user_list_messages(converted)
        assert len(follow_ups) == 0
|
||||||
@@ -86,8 +86,30 @@ class AnthropicServingMessages(OpenAIServingChat):
|
|||||||
"tool_calls": "tool_use",
|
"tool_calls": "tool_use",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@staticmethod
def _convert_image_source_to_url(source: dict[str, Any]) -> str:
    """Convert an Anthropic image source to an OpenAI-compatible URL.

    Anthropic supports two image source types:
    - base64: {"type": "base64", "media_type": "image/jpeg", "data": "..."}
    - url: {"type": "url", "url": "https://..."}

    For base64 sources, this constructs a proper data URI that
    downstream processors (e.g. vLLM's media connector) can handle.

    Args:
        source: The ``source`` mapping of an Anthropic image block.

    Returns:
        The ``url`` value for ``"url"`` sources (``""`` when it is absent
        or null); otherwise ``data:<media_type>;base64,<data>``, where
        ``media_type`` falls back to ``image/jpeg`` and ``data`` to ``""``
        when absent or null.
    """
    if source.get("type") == "url":
        # ``or`` (rather than a ``.get`` default) also maps an explicit
        # null to "", so the declared ``str`` return type always holds.
        return source.get("url") or ""
    # Every non-"url" source -- "base64", a missing type, or any other
    # value -- takes the base64 path, so a data URI is always built.
    media_type = source.get("media_type") or "image/jpeg"
    data = source.get("data") or ""
    return f"data:{media_type};base64,{data}"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
def _convert_anthropic_to_openai_request(
|
def _convert_anthropic_to_openai_request(
|
||||||
self, anthropic_request: AnthropicMessagesRequest
|
cls, anthropic_request: AnthropicMessagesRequest
|
||||||
) -> ChatCompletionRequest:
|
) -> ChatCompletionRequest:
|
||||||
"""Convert Anthropic message format to OpenAI format"""
|
"""Convert Anthropic message format to OpenAI format"""
|
||||||
openai_messages = []
|
openai_messages = []
|
||||||
@@ -119,10 +141,11 @@ class AnthropicServingMessages(OpenAIServingChat):
|
|||||||
if block.type == "text" and block.text:
|
if block.type == "text" and block.text:
|
||||||
content_parts.append({"type": "text", "text": block.text})
|
content_parts.append({"type": "text", "text": block.text})
|
||||||
elif block.type == "image" and block.source:
|
elif block.type == "image" and block.source:
|
||||||
|
image_url = cls._convert_image_source_to_url(block.source)
|
||||||
content_parts.append(
|
content_parts.append(
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {"url": block.source.get("data", "")},
|
"image_url": {"url": image_url},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
elif block.type == "thinking" and block.thinking is not None:
|
elif block.type == "thinking" and block.thinking is not None:
|
||||||
@@ -140,15 +163,50 @@ class AnthropicServingMessages(OpenAIServingChat):
|
|||||||
tool_calls.append(tool_call)
|
tool_calls.append(tool_call)
|
||||||
elif block.type == "tool_result":
|
elif block.type == "tool_result":
|
||||||
if msg.role == "user":
|
if msg.role == "user":
|
||||||
|
# Parse tool_result content which can be
|
||||||
|
# a string or a list of content blocks
|
||||||
|
# (text, image, etc.)
|
||||||
|
tool_text = ""
|
||||||
|
tool_image_urls: list[str] = []
|
||||||
|
if isinstance(block.content, str):
|
||||||
|
tool_text = block.content
|
||||||
|
elif isinstance(block.content, list):
|
||||||
|
text_parts: list[str] = []
|
||||||
|
for item in block.content:
|
||||||
|
if not isinstance(item, dict):
|
||||||
|
continue
|
||||||
|
item_type = item.get("type")
|
||||||
|
if item_type == "text":
|
||||||
|
text_parts.append(item.get("text", ""))
|
||||||
|
elif item_type == "image":
|
||||||
|
source = item.get("source", {})
|
||||||
|
url = cls._convert_image_source_to_url(source)
|
||||||
|
if url:
|
||||||
|
tool_image_urls.append(url)
|
||||||
|
tool_text = "\n".join(text_parts)
|
||||||
openai_messages.append(
|
openai_messages.append(
|
||||||
{
|
{
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"tool_call_id": block.tool_use_id or "",
|
"tool_call_id": block.tool_use_id or "",
|
||||||
"content": str(block.content)
|
"content": tool_text or "",
|
||||||
if block.content
|
|
||||||
else "",
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
# OpenAI tool messages only support string
|
||||||
|
# content, so inject images from tool
|
||||||
|
# results as a follow-up user message
|
||||||
|
if tool_image_urls:
|
||||||
|
openai_messages.append(
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [ # type: ignore[dict-item]
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {"url": img},
|
||||||
|
}
|
||||||
|
for img in tool_image_urls
|
||||||
|
],
|
||||||
|
}
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Assistant tool result becomes regular text
|
# Assistant tool result becomes regular text
|
||||||
tool_result_text = (
|
tool_result_text = (
|
||||||
|
|||||||
Reference in New Issue
Block a user