[Frontend] Exclude anthropic billing header to avoid prefix cache miss (#36829)

Signed-off-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-11 18:20:34 -07:00
parent c34ba6b961
commit 262b76a09f
3 changed files with 56 additions and 0 deletions
--- a/docs/serving/integrations/claude_code.md
+++ b/docs/serving/integrations/claude_code.md
@@ -60,6 +60,9 @@ The environment variables:
 !!! tip
    You can add these environment variables to your shell profile (e.g., `.bashrc`, `.zshrc`), Claude Code configuration file (`~/.claude/settings.json`), or create a wrapper script for convenience.

+!!! warning
+    Claude Code recently started injecting a per-request hash in the system prompt, which can defeat [prefix caching](../../design/prefix_caching.md) because the prompt changes on every request, causing greatly reduced performance. This is addressed automatically in vLLM versions > 0.17.1 but for older versions `"CLAUDE_CODE_ATTRIBUTION_HEADER": "0"` should be added to the `"env"` section of `~/.claude/settings.json` (see this [blog post](https://unsloth.ai/docs/basics/claude-code#fixing-90-slower-inference-in-claude-code) from Unsloth).
+
 ## Testing the Setup

 Once Claude Code launches, try a simple prompt to verify the connection:
--- a/tests/entrypoints/openai/test_anthropic_messages_conversion.py
+++ b/tests/entrypoints/openai/test_anthropic_messages_conversion.py
@@ -324,3 +324,52 @@ class TestToolResultContent:
            if m["role"] == "user" and isinstance(m.get("content"), list)
        ]
        assert len(user_follow_ups) == 0
+
+
+# ======================================================================
+# Attribution header stripping
+# ======================================================================
+
+
+class TestAttributionHeaderStripping:
+    def test_billing_header_stripped_from_system(self):
+        """Claude Code's x-anthropic-billing-header block should be
+        stripped to preserve prefix caching."""
+        request = _make_request(
+            [{"role": "user", "content": "Hello"}],
+            system=[
+                {"type": "text", "text": "You are a helpful assistant."},
+                {
+                    "type": "text",
+                    "text": "x-anthropic-billing-header: "
+                    "cc_version=2.1.37.abc; cc_entrypoint=cli;",
+                },
+            ],
+        )
+        result = _convert(request)
+        system_msg = result.messages[0]
+        assert system_msg["role"] == "system"
+        assert system_msg["content"] == "You are a helpful assistant."
+
+    def test_system_without_billing_header_unchanged(self):
+        """Normal system blocks should pass through unchanged."""
+        request = _make_request(
+            [{"role": "user", "content": "Hello"}],
+            system=[
+                {"type": "text", "text": "You are a helpful assistant."},
+                {"type": "text", "text": " Be concise."},
+            ],
+        )
+        result = _convert(request)
+        system_msg = result.messages[0]
+        assert system_msg["content"] == "You are a helpful assistant. Be concise."
+
+    def test_system_string_unchanged(self):
+        """String system prompts should pass through unchanged."""
+        request = _make_request(
+            [{"role": "user", "content": "Hello"}],
+            system="You are a helpful assistant.",
+        )
+        result = _convert(request)
+        system_msg = result.messages[0]
+        assert system_msg["content"] == "You are a helpful assistant."
--- a/vllm/entrypoints/anthropic/serving.py
+++ b/vllm/entrypoints/anthropic/serving.py
@@ -143,6 +143,10 @@ class AnthropicServingMessages(OpenAIServingChat):
            system_prompt = ""
            for block in anthropic_request.system:
                if block.type == "text" and block.text:
+                    # Strip Claude Code's attribution header which contains
+                    # a per-request hash that defeats prefix caching.
+                    if block.text.startswith("x-anthropic-billing-header"):
+                        continue
                    system_prompt += block.text
            openai_messages.append({"role": "system", "content": system_prompt})