From fca3f4665838605e268a8408bc7ca359f5d5c14b Mon Sep 17 00:00:00 2001 From: Benjamin Bartels Date: Thu, 4 Dec 2025 05:50:27 +0000 Subject: [PATCH] [Frontend] Fixes anthropic /v1/messages streaming not containing input_tokens on first chunk (#29971) Signed-off-by: bbartels --- tests/entrypoints/openai/test_messages.py | 11 +++++++++++ vllm/entrypoints/anthropic/serving_messages.py | 10 +++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_messages.py b/tests/entrypoints/openai/test_messages.py index 3e390ad49..b804a1a7a 100644 --- a/tests/entrypoints/openai/test_messages.py +++ b/tests/entrypoints/openai/test_messages.py @@ -69,9 +69,20 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic): stream=True, ) + first_chunk = None + chunk_count = 0 async for chunk in resp: + chunk_count += 1 + if first_chunk is None and chunk.type == "message_start": + first_chunk = chunk print(chunk.model_dump_json()) + assert chunk_count > 0 + assert first_chunk is not None, "message_start chunk was never observed" + assert first_chunk.usage is not None, "first chunk should include usage stats" + assert first_chunk.usage["output_tokens"] == 0 + assert first_chunk.usage["input_tokens"] > 5 + @pytest.mark.asyncio async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic): diff --git a/vllm/entrypoints/anthropic/serving_messages.py b/vllm/entrypoints/anthropic/serving_messages.py index 340dabf0e..e7ea3bb59 100644 --- a/vllm/entrypoints/anthropic/serving_messages.py +++ b/vllm/entrypoints/anthropic/serving_messages.py @@ -183,7 +183,9 @@ class AnthropicServingMessages(OpenAIServingChat): if anthropic_request.stream: req.stream = anthropic_request.stream - req.stream_options = StreamOptions.validate({"include_usage": True}) + req.stream_options = StreamOptions.validate( + {"include_usage": True, "continuous_usage_stats": True} + ) if anthropic_request.tool_choice is None: req.tool_choice = None @@ -323,6 +325,12 @@ class AnthropicServingMessages(OpenAIServingChat): content=[], model=origin_chunk.model, ), + usage=AnthropicUsage( + input_tokens=origin_chunk.usage.prompt_tokens + if origin_chunk.usage + else 0, + output_tokens=0, + ), ) first_item = False data = chunk.model_dump_json(exclude_unset=True)