diff --git a/tests/entrypoints/openai/test_anthropic_messages_conversion.py b/tests/entrypoints/openai/test_anthropic_messages_conversion.py
new file mode 100644
index 000000000..3647c187f
--- /dev/null
+++ b/tests/entrypoints/openai/test_anthropic_messages_conversion.py
@@ -0,0 +1,377 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for Anthropic-to-OpenAI request conversion.
+
+Tests the image source handling and tool_result content parsing in
+AnthropicServingMessages._convert_anthropic_to_openai_request().
+"""
+
+from vllm.entrypoints.anthropic.protocol import (
+    AnthropicMessagesRequest,
+)
+from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
+
+# Static/class methods under test; no serving instance is needed.
+_convert = AnthropicServingMessages._convert_anthropic_to_openai_request
+_img_url = AnthropicServingMessages._convert_image_source_to_url
+
+
+def _make_request(
+    messages: list[dict],
+    **kwargs,
+) -> AnthropicMessagesRequest:
+    """Build a minimal Anthropic request around ``messages``."""
+    return AnthropicMessagesRequest(
+        model="test-model",
+        max_tokens=128,
+        messages=messages,
+        **kwargs,
+    )
+
+
+# ======================================================================
+# _convert_image_source_to_url
+# ======================================================================
+
+
+class TestConvertImageSourceToUrl:
+    """Direct tests of the image-source-to-URL helper."""
+
+    def test_base64_source(self):
+        source = {
+            "type": "base64",
+            "media_type": "image/jpeg",
+            "data": "iVBORw0KGgo=",
+        }
+        assert _img_url(source) == "data:image/jpeg;base64,iVBORw0KGgo="
+
+    def test_base64_png(self):
+        source = {
+            "type": "base64",
+            "media_type": "image/png",
+            "data": "AAAA",
+        }
+        assert _img_url(source) == "data:image/png;base64,AAAA"
+
+    def test_url_source(self):
+        source = {
+            "type": "url",
+            "url": "https://example.com/image.jpg",
+        }
+        assert _img_url(source) == "https://example.com/image.jpg"
+
+    def test_missing_type_defaults_to_base64(self):
+        """When 'type' is absent, treat as base64."""
+        source = {
+            "media_type": "image/webp",
+            "data": "UklGR",
+        }
+        assert _img_url(source) == "data:image/webp;base64,UklGR"
+
+    def test_missing_media_type_defaults_to_jpeg(self):
+        source = {"type": "base64", "data": "abc123"}
+        assert _img_url(source) == "data:image/jpeg;base64,abc123"
+
+    def test_url_source_missing_url_returns_empty(self):
+        source = {"type": "url"}
+        assert _img_url(source) == ""
+
+    def test_empty_source_returns_data_uri_shell(self):
+        source: dict = {}
+        assert _img_url(source) == "data:image/jpeg;base64,"
+
+    def test_null_fields_fall_back_to_defaults(self):
+        """Explicit JSON nulls behave like missing keys."""
+        assert _img_url({"type": "url", "url": None}) == ""
+        source = {"type": "base64", "media_type": None, "data": None}
+        assert _img_url(source) == "data:image/jpeg;base64,"
+
+
+# ======================================================================
+# Image blocks inside user messages
+# ======================================================================
+
+
+class TestImageContentBlocks:
+    """Image blocks placed directly in a user message."""
+
+    def test_base64_image_in_user_message(self):
+        request = _make_request(
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Describe this image"},
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": "image/jpeg",
+                                "data": "iVBORw0KGgo=",
+                            },
+                        },
+                    ],
+                }
+            ]
+        )
+
+        result = _convert(request)
+        user_msg = result.messages[0]
+        assert user_msg["role"] == "user"
+
+        parts = user_msg["content"]
+        assert len(parts) == 2
+        assert parts[0] == {"type": "text", "text": "Describe this image"}
+        assert parts[1] == {
+            "type": "image_url",
+            "image_url": {"url": "data:image/jpeg;base64,iVBORw0KGgo="},
+        }
+
+    def test_url_image_in_user_message(self):
+        request = _make_request(
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is this?"},
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "url",
+                                "url": "https://example.com/cat.png",
+                            },
+                        },
+                    ],
+                }
+            ]
+        )
+
+        result = _convert(request)
+        parts = result.messages[0]["content"]
+        assert parts[1] == {
+            "type": "image_url",
+            "image_url": {"url": "https://example.com/cat.png"},
+        }
+
+
+# ======================================================================
+# tool_result content handling
+# ======================================================================
+
+
+class TestToolResultContent:
+    """tool_result blocks with string, text, image or null content."""
+
+    def _make_tool_result_request(
+        self, tool_result_content
+    ) -> AnthropicMessagesRequest:
+        """Build a request with assistant tool_use followed by user
+        tool_result."""
+        return _make_request(
+            [
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "tool_use",
+                            "id": "call_001",
+                            "name": "read_file",
+                            "input": {"path": "/tmp/img.png"},
+                        }
+                    ],
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "tool_result",
+                            "tool_use_id": "call_001",
+                            "content": tool_result_content,
+                        }
+                    ],
+                },
+            ]
+        )
+
+    def test_tool_result_string_content(self):
+        request = self._make_tool_result_request("file contents here")
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "file contents here"
+        assert tool_msg[0]["tool_call_id"] == "call_001"
+
+    def test_tool_result_text_blocks(self):
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "line 1"},
+                {"type": "text", "text": "line 2"},
+            ]
+        )
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "line 1\nline 2"
+
+    def test_tool_result_with_image(self):
+        """Image in tool_result should produce a follow-up user message."""
+        request = self._make_tool_result_request(
+            [
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/png",
+                        "data": "AAAA",
+                    },
+                }
+            ]
+        )
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == ""
+
+        # The image should be injected as a follow-up user message
+        follow_up = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(follow_up) == 1
+        img_parts = follow_up[0]["content"]
+        assert len(img_parts) == 1
+        assert img_parts[0] == {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,AAAA"},
+        }
+
+    def test_tool_result_with_text_and_image(self):
+        """Mixed text+image tool_result: text in tool msg, image in user
+        msg."""
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "Here is the screenshot"},
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/jpeg",
+                        "data": "QUFB",
+                    },
+                },
+            ]
+        )
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "Here is the screenshot"
+
+        follow_up = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(follow_up) == 1
+        assert follow_up[0]["content"][0]["image_url"]["url"] == (
+            "data:image/jpeg;base64,QUFB"
+        )
+
+    def test_tool_result_with_multiple_images(self):
+        request = self._make_tool_result_request(
+            [
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/png",
+                        "data": "IMG1",
+                    },
+                },
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "url",
+                        "url": "https://example.com/img2.jpg",
+                    },
+                },
+            ]
+        )
+        result = _convert(request)
+
+        follow_up = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(follow_up) == 1
+        urls = [p["image_url"]["url"] for p in follow_up[0]["content"]]
+        assert urls == [
+            "data:image/png;base64,IMG1",
+            "https://example.com/img2.jpg",
+        ]
+
+    def test_tool_result_none_content(self):
+        request = self._make_tool_result_request(None)
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == ""
+
+    def test_tool_result_no_follow_up_when_no_images(self):
+        """Ensure no extra user message is added when there are no images."""
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "just text"},
+            ]
+        )
+        result = _convert(request)
+
+        user_follow_ups = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(user_follow_ups) == 0
+
+    def test_tool_result_image_without_source_is_skipped(self):
+        """Image items lacking a usable source add no follow-up message."""
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "no picture"},
+                {"type": "image"},
+                {"type": "image", "source": None},
+                {"type": "image", "source": {}},
+            ]
+        )
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "no picture"
+
+        user_follow_ups = [
+            m
+            for m in result.messages
+            if m["role"] == "user" and isinstance(m.get("content"), list)
+        ]
+        assert len(user_follow_ups) == 0
+
+    def test_tool_result_null_text_is_treated_as_empty(self):
+        """A text item whose 'text' is null must not break the join."""
+        request = self._make_tool_result_request(
+            [
+                {"type": "text", "text": "line 1"},
+                {"type": "text", "text": None},
+            ]
+        )
+        result = _convert(request)
+
+        tool_msg = [m for m in result.messages if m["role"] == "tool"]
+        assert len(tool_msg) == 1
+        assert tool_msg[0]["content"] == "line 1\n"
diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py
index 6318f854a..82af26476 100644
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -86,8 +86,40 @@ class AnthropicServingMessages(OpenAIServingChat):
             "tool_calls": "tool_use",
         }
 
+    @staticmethod
+    def _convert_image_source_to_url(source: dict[str, Any]) -> str:
+        """Convert an Anthropic image source to an OpenAI-compatible URL.
+
+        Anthropic supports two image source types:
+        - base64: {"type": "base64", "media_type": "image/jpeg", "data": "..."}
+        - url: {"type": "url", "url": "https://..."}
+
+        For base64 sources, this constructs a proper data URI that
+        downstream processors (e.g. vLLM's media connector) can handle.
+
+        Args:
+            source: The ``source`` dict of an Anthropic image block.
+
+        Returns:
+            The URL for ``url`` sources ("" when absent or null); otherwise
+            a ``data:`` URI. A missing or null ``media_type`` falls back to
+            "image/jpeg" and a missing or null ``data`` to "".
+        """
+        source_type = source.get("type")
+        if source_type == "url":
+            # ``or`` (not a ``.get`` default) so that an explicit JSON
+            # null is treated like a missing key.
+            return source.get("url") or ""
+        # Default to base64 processing if type is "base64"
+        # or missing, ensuring a proper data URI is always
+        # constructed for non-URL sources.
+        media_type = source.get("media_type") or "image/jpeg"
+        data = source.get("data") or ""
+        return f"data:{media_type};base64,{data}"
+
+    @classmethod
     def _convert_anthropic_to_openai_request(
-        self, anthropic_request: AnthropicMessagesRequest
+        cls, anthropic_request: AnthropicMessagesRequest
     ) -> ChatCompletionRequest:
         """Convert Anthropic message format to OpenAI format"""
         openai_messages = []
@@ -119,10 +151,11 @@ class AnthropicServingMessages(OpenAIServingChat):
                     if block.type == "text" and block.text:
                         content_parts.append({"type": "text", "text": block.text})
                     elif block.type == "image" and block.source:
+                        image_url = cls._convert_image_source_to_url(block.source)
                         content_parts.append(
                             {
                                 "type": "image_url",
-                                "image_url": {"url": block.source.get("data", "")},
+                                "image_url": {"url": image_url},
                             }
                         )
                     elif block.type == "thinking" and block.thinking is not None:
@@ -140,15 +173,60 @@ class AnthropicServingMessages(OpenAIServingChat):
                         tool_calls.append(tool_call)
                     elif block.type == "tool_result":
                         if msg.role == "user":
+                            # Parse tool_result content which can be
+                            # a string or a list of content blocks
+                            # (text, image, etc.)
+                            tool_text = ""
+                            tool_image_urls: list[str] = []
+                            if isinstance(block.content, str):
+                                tool_text = block.content
+                            elif isinstance(block.content, list):
+                                text_parts: list[str] = []
+                                for item in block.content:
+                                    if not isinstance(item, dict):
+                                        continue
+                                    item_type = item.get("type")
+                                    if item_type == "text":
+                                        # "text" may be an explicit null;
+                                        # str.join() rejects None.
+                                        text_parts.append(item.get("text") or "")
+                                    elif item_type == "image":
+                                        source = item.get("source")
+                                        # Mirror the user-message path,
+                                        # which ignores images without a
+                                        # source; a null source would
+                                        # otherwise crash the helper and
+                                        # an empty one would yield a
+                                        # data URI with no payload.
+                                        if not isinstance(source, dict) or not source:
+                                            continue
+                                        url = cls._convert_image_source_to_url(source)
+                                        if url:
+                                            tool_image_urls.append(url)
+                                tool_text = "\n".join(text_parts)
                             openai_messages.append(
                                 {
                                     "role": "tool",
                                     "tool_call_id": block.tool_use_id or "",
-                                    "content": str(block.content)
-                                    if block.content
-                                    else "",
+                                    "content": tool_text,
                                 }
                             )
+                            # OpenAI tool messages only support string
+                            # content, so inject images from tool
+                            # results as a follow-up user message
+                            if tool_image_urls:
+                                openai_messages.append(
+                                    {
+                                        "role": "user",
+                                        "content": [  # type: ignore[dict-item]
+                                            {
+                                                "type": "image_url",
+                                                "image_url": {"url": img},
+                                            }
+                                            for img in tool_image_urls
+                                        ],
+                                    }
+                                )
                         else:
                             # Assistant tool result becomes regular text
                             tool_result_text = (