diff --git a/docs/serving/integrations/claude_code.md b/docs/serving/integrations/claude_code.md
index 716c85231..99a89a076 100644
--- a/docs/serving/integrations/claude_code.md
+++ b/docs/serving/integrations/claude_code.md
@@ -60,6 +60,9 @@ The environment variables:
 !!! tip
     You can add these environment variables to your shell profile (e.g., `.bashrc`, `.zshrc`), Claude Code configuration file (`~/.claude/settings.json`), or create a wrapper script for convenience.
 
+!!! warning
+    Claude Code recently started injecting a per-request hash into the system prompt. Because the prompt then changes on every request, [prefix caching](../../design/prefix_caching.md) is defeated and performance drops sharply. vLLM versions newer than 0.17.1 handle this automatically; on older versions, add `"CLAUDE_CODE_ATTRIBUTION_HEADER": "0"` to the `"env"` section of `~/.claude/settings.json` (see this [blog post](https://unsloth.ai/docs/basics/claude-code#fixing-90-slower-inference-in-claude-code) from Unsloth).
+
 ## Testing the Setup
 
 Once Claude Code launches, try a simple prompt to verify the connection:
diff --git a/tests/entrypoints/openai/test_anthropic_messages_conversion.py b/tests/entrypoints/openai/test_anthropic_messages_conversion.py
index 3647c187f..e3b006c16 100644
--- a/tests/entrypoints/openai/test_anthropic_messages_conversion.py
+++ b/tests/entrypoints/openai/test_anthropic_messages_conversion.py
@@ -324,3 +324,52 @@ class TestToolResultContent:
             if m["role"] == "user" and isinstance(m.get("content"), list)
         ]
         assert len(user_follow_ups) == 0
+
+
+# ======================================================================
+# Attribution header stripping
+# ======================================================================
+
+
+class TestAttributionHeaderStripping:
+    def test_billing_header_stripped_from_system(self):
+        """Claude Code's x-anthropic-billing-header block should be
+        stripped to preserve prefix caching."""
+        request = _make_request(
+            [{"role": "user", "content": "Hello"}],
+            system=[
+                {"type": "text", "text": "You are a helpful assistant."},
+                {
+                    "type": "text",
+                    "text": "x-anthropic-billing-header: "
+                    "cc_version=2.1.37.abc; cc_entrypoint=cli;",
+                },
+            ],
+        )
+        result = _convert(request)
+        system_msg = result.messages[0]
+        assert system_msg["role"] == "system"
+        assert system_msg["content"] == "You are a helpful assistant."
+
+    def test_system_without_billing_header_unchanged(self):
+        """Normal system blocks should pass through unchanged."""
+        request = _make_request(
+            [{"role": "user", "content": "Hello"}],
+            system=[
+                {"type": "text", "text": "You are a helpful assistant."},
+                {"type": "text", "text": " Be concise."},
+            ],
+        )
+        result = _convert(request)
+        system_msg = result.messages[0]
+        assert system_msg["content"] == "You are a helpful assistant. Be concise."
+
+    def test_system_string_unchanged(self):
+        """String system prompts should pass through unchanged."""
+        request = _make_request(
+            [{"role": "user", "content": "Hello"}],
+            system="You are a helpful assistant.",
+        )
+        result = _convert(request)
+        system_msg = result.messages[0]
+        assert system_msg["content"] == "You are a helpful assistant."
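For readers applying the workaround from the docs change above on vLLM 0.17.1 and older, a minimal `~/.claude/settings.json` might look like the sketch below. Only the `CLAUDE_CODE_ATTRIBUTION_HEADER` entry comes from this diff; the `ANTHROPIC_BASE_URL` value is an illustrative assumption for a locally served vLLM endpoint.

```json
{
  "env": {
    "ANTHROPIC_BASE_URL": "http://localhost:8000",
    "CLAUDE_CODE_ATTRIBUTION_HEADER": "0"
  }
}
```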
diff --git a/vllm/entrypoints/anthropic/serving.py b/vllm/entrypoints/anthropic/serving.py
index 85232e918..a536ae77a 100644
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -143,6 +143,10 @@ class AnthropicServingMessages(OpenAIServingChat):
         system_prompt = ""
         for block in anthropic_request.system:
             if block.type == "text" and block.text:
+                # Strip Claude Code's attribution header, which contains
+                # a per-request hash that defeats prefix caching.
+                if block.text.startswith("x-anthropic-billing-header"):
+                    continue
                 system_prompt += block.text
         openai_messages.append({"role": "system", "content": system_prompt})
 
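Viewed outside the diff, the server-side change reduces to a small filter over system text blocks. The standalone sketch below mirrors that logic; the `SystemBlock` type and `build_system_prompt` name are illustrative stand-ins, not vLLM's actual API.

```python
from dataclasses import dataclass


@dataclass
class SystemBlock:
    """Minimal stand-in for an Anthropic system content block."""

    type: str
    text: str


def build_system_prompt(blocks: list[SystemBlock]) -> str:
    """Concatenate text blocks, dropping Claude Code's attribution header."""
    system_prompt = ""
    for block in blocks:
        if block.type == "text" and block.text:
            # The attribution block carries a per-request hash, so it would
            # change on every call and invalidate the shared prompt prefix.
            if block.text.startswith("x-anthropic-billing-header"):
                continue
            system_prompt += block.text
    return system_prompt


blocks = [
    SystemBlock("text", "You are a helpful assistant."),
    SystemBlock("text", "x-anthropic-billing-header: cc_version=2.1.37.abc;"),
]
assert build_system_prompt(blocks) == "You are a helpful assistant."
```

Matching on the stable `x-anthropic-billing-header` prefix rather than the hash itself means the filter keeps working even as the per-request value changes.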