diff --git a/tests/entrypoints/openai/test_messages.py b/tests/entrypoints/openai/test_messages.py index 3e390ad49..b804a1a7a 100644 --- a/tests/entrypoints/openai/test_messages.py +++ b/tests/entrypoints/openai/test_messages.py @@ -69,9 +69,20 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic): stream=True, ) + first_chunk = None + chunk_count = 0 async for chunk in resp: + chunk_count += 1 + if first_chunk is None and chunk.type == "message_start": + first_chunk = chunk print(chunk.model_dump_json()) + assert chunk_count > 0 + assert first_chunk is not None, "message_start chunk was never observed" + assert first_chunk.usage is not None, "first chunk should include usage stats" + assert first_chunk.usage["output_tokens"] == 0 + assert first_chunk.usage["input_tokens"] > 5 + @pytest.mark.asyncio async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic): diff --git a/vllm/entrypoints/anthropic/serving_messages.py b/vllm/entrypoints/anthropic/serving_messages.py index 340dabf0e..e7ea3bb59 100644 --- a/vllm/entrypoints/anthropic/serving_messages.py +++ b/vllm/entrypoints/anthropic/serving_messages.py @@ -183,7 +183,9 @@ class AnthropicServingMessages(OpenAIServingChat): if anthropic_request.stream: req.stream = anthropic_request.stream - req.stream_options = StreamOptions.validate({"include_usage": True}) + req.stream_options = StreamOptions.validate( + {"include_usage": True, "continuous_usage_stats": True} + ) if anthropic_request.tool_choice is None: req.tool_choice = None @@ -323,6 +325,12 @@ class AnthropicServingMessages(OpenAIServingChat): content=[], model=origin_chunk.model, ), + usage=AnthropicUsage( + input_tokens=origin_chunk.usage.prompt_tokens + if origin_chunk.usage + else 0, + output_tokens=0, + ), ) first_item = False data = chunk.model_dump_json(exclude_unset=True)