[Frontend] Exclude anthropic billing header to avoid prefix cache miss (#36829)
Signed-off-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -60,6 +60,9 @@ The environment variables:
|
||||
!!! tip
|
||||
You can add these environment variables to your shell profile (e.g., `.bashrc`, `.zshrc`), Claude Code configuration file (`~/.claude/settings.json`), or create a wrapper script for convenience.
|
||||
|
||||
!!! warning
|
||||
Claude Code recently started injecting a per-request hash into the system prompt. Because the prompt then changes on every request, this can defeat [prefix caching](../../design/prefix_caching.md) and greatly reduce performance. vLLM versions newer than 0.17.1 handle this automatically; for older versions, add `"CLAUDE_CODE_ATTRIBUTION_HEADER": "0"` to the `"env"` section of `~/.claude/settings.json` (see this [blog post](https://unsloth.ai/docs/basics/claude-code#fixing-90-slower-inference-in-claude-code) from Unsloth).
|
||||
|
||||
## Testing the Setup
|
||||
|
||||
Once Claude Code launches, try a simple prompt to verify the connection:
|
||||
|
||||
@@ -324,3 +324,52 @@ class TestToolResultContent:
|
||||
if m["role"] == "user" and isinstance(m.get("content"), list)
|
||||
]
|
||||
assert len(user_follow_ups) == 0
|
||||
|
||||
|
||||
# ======================================================================
|
||||
# Attribution header stripping
|
||||
# ======================================================================
|
||||
|
||||
|
||||
class TestAttributionHeaderStripping:
    """Conversion checks for the ``system`` field when it may contain
    Claude Code's attribution (billing) header block."""

    def test_billing_header_stripped_from_system(self):
        """A system text block starting with x-anthropic-billing-header
        is dropped so the system prompt stays stable for prefix caching."""
        prompt_block = {"type": "text", "text": "You are a helpful assistant."}
        billing_block = {
            "type": "text",
            "text": "x-anthropic-billing-header: "
            "cc_version=2.1.37.abc; cc_entrypoint=cli;",
        }
        request = _make_request(
            [{"role": "user", "content": "Hello"}],
            system=[prompt_block, billing_block],
        )

        first_message = _convert(request).messages[0]

        # Only the ordinary prompt text survives in the system message.
        assert first_message["role"] == "system"
        assert first_message["content"] == "You are a helpful assistant."

    def test_system_without_billing_header_unchanged(self):
        """Ordinary system text blocks are concatenated as-is."""
        system_blocks = [
            {"type": "text", "text": "You are a helpful assistant."},
            {"type": "text", "text": " Be concise."},
        ]
        request = _make_request(
            [{"role": "user", "content": "Hello"}],
            system=system_blocks,
        )

        first_message = _convert(request).messages[0]

        assert first_message["content"] == "You are a helpful assistant. Be concise."

    def test_system_string_unchanged(self):
        """A plain-string system prompt is passed through untouched."""
        prompt_text = "You are a helpful assistant."
        request = _make_request(
            [{"role": "user", "content": "Hello"}],
            system=prompt_text,
        )

        first_message = _convert(request).messages[0]

        assert first_message["content"] == "You are a helpful assistant."
|
||||
|
||||
@@ -143,6 +143,10 @@ class AnthropicServingMessages(OpenAIServingChat):
|
||||
system_prompt = ""
|
||||
for block in anthropic_request.system:
|
||||
if block.type == "text" and block.text:
|
||||
# Strip Claude Code's attribution header which contains
|
||||
# a per-request hash that defeats prefix caching.
|
||||
if block.text.startswith("x-anthropic-billing-header"):
|
||||
continue
|
||||
system_prompt += block.text
|
||||
openai_messages.append({"role": "system", "content": system_prompt})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user