[Frontend] Exclude anthropic billing header to avoid prefix cache miss (#36829)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Nick Hill
2026-03-11 18:20:34 -07:00
committed by GitHub
parent c34ba6b961
commit 262b76a09f
3 changed files with 56 additions and 0 deletions

View File

@@ -60,6 +60,9 @@ The environment variables:
!!! tip
You can add these environment variables to your shell profile (e.g., `.bashrc`, `.zshrc`), Claude Code configuration file (`~/.claude/settings.json`), or create a wrapper script for convenience.
!!! warning
Claude Code recently started injecting a per-request hash into the system prompt. Because this makes the prompt different on every request, it can defeat [prefix caching](../../design/prefix_caching.md) and greatly reduce performance. vLLM versions later than 0.17.1 handle this automatically; for 0.17.1 and older, add `"CLAUDE_CODE_ATTRIBUTION_HEADER": "0"` to the `"env"` section of `~/.claude/settings.json` (see this [blog post](https://unsloth.ai/docs/basics/claude-code#fixing-90-slower-inference-in-claude-code) from Unsloth).
## Testing the Setup
Once Claude Code launches, try a simple prompt to verify the connection:

View File

@@ -324,3 +324,52 @@ class TestToolResultContent:
if m["role"] == "user" and isinstance(m.get("content"), list)
]
assert len(user_follow_ups) == 0
# ======================================================================
# Attribution header stripping
# ======================================================================
class TestAttributionHeaderStripping:
    """Covers how system prompt blocks are flattened into the system
    message, with and without Claude Code's attribution header block."""

    def test_billing_header_stripped_from_system(self):
        """A system block that begins with x-anthropic-billing-header must
        be dropped so only the real prompt text remains."""
        prompt_block = {"type": "text", "text": "You are a helpful assistant."}
        billing_block = {
            "type": "text",
            "text": (
                "x-anthropic-billing-header: "
                "cc_version=2.1.37.abc; cc_entrypoint=cli;"
            ),
        }
        user_turns = [{"role": "user", "content": "Hello"}]

        converted = _convert(
            _make_request(user_turns, system=[prompt_block, billing_block])
        )

        first_message = converted.messages[0]
        assert first_message["role"] == "system"
        assert first_message["content"] == "You are a helpful assistant."

    def test_system_without_billing_header_unchanged(self):
        """Ordinary text blocks are concatenated in order, untouched."""
        system_blocks = [
            {"type": "text", "text": "You are a helpful assistant."},
            {"type": "text", "text": " Be concise."},
        ]
        user_turns = [{"role": "user", "content": "Hello"}]

        converted = _convert(_make_request(user_turns, system=system_blocks))

        first_message = converted.messages[0]
        assert first_message["content"] == "You are a helpful assistant. Be concise."

    def test_system_string_unchanged(self):
        """A plain string system prompt is forwarded as-is."""
        user_turns = [{"role": "user", "content": "Hello"}]

        converted = _convert(
            _make_request(user_turns, system="You are a helpful assistant.")
        )

        first_message = converted.messages[0]
        assert first_message["content"] == "You are a helpful assistant."

View File

@@ -143,6 +143,10 @@ class AnthropicServingMessages(OpenAIServingChat):
system_prompt = ""
for block in anthropic_request.system:
if block.type == "text" and block.text:
# Strip Claude Code's attribution header which contains
# a per-request hash that defeats prefix caching.
if block.text.startswith("x-anthropic-billing-header"):
continue
system_prompt += block.text
openai_messages.append({"role": "system", "content": system_prompt})