[Bugfix] Fix misleading context length error messages (#36197)
Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -200,7 +200,7 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
|
||||
batch_2 = [valid_msg, valid_msg]
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=10)
|
||||
|
||||
with pytest.raises(ValueError, match="context length is only"):
|
||||
with pytest.raises(ValueError, match="maximum context length is"):
|
||||
llm.chat(batch_1, sampling_params=sampling_params)
|
||||
assert llm.llm_engine.get_num_unfinished_requests() == 0
|
||||
|
||||
|
||||
@@ -271,7 +271,7 @@ class TestRenderPrompt:
|
||||
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="input characters and requested .* context length is only",
|
||||
match="maximum context length is",
|
||||
):
|
||||
renderer.tokenize_prompts(
|
||||
prompts,
|
||||
@@ -292,7 +292,7 @@ class TestRenderPrompt:
|
||||
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="input tokens and requested .* context length is only",
|
||||
match="maximum context length is",
|
||||
):
|
||||
renderer.tokenize_prompts(
|
||||
prompts,
|
||||
@@ -313,7 +313,7 @@ class TestRenderPrompt:
|
||||
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="input tokens and requested .* context length is only",
|
||||
match="maximum context length is",
|
||||
):
|
||||
renderer.tokenize_prompts(
|
||||
prompts,
|
||||
|
||||
@@ -791,11 +791,15 @@ class OpenAIServing:
|
||||
|
||||
if max_tokens is not None and token_num + max_tokens > max_model_len:
|
||||
raise VLLMValidationError(
|
||||
"'max_tokens' or 'max_completion_tokens' is too large: "
|
||||
f"{max_tokens}. This model's maximum context length is "
|
||||
f"{max_model_len} tokens and your request has "
|
||||
f"{token_num} input tokens ({max_tokens} > {max_model_len}"
|
||||
f" - {token_num}).",
|
||||
f"This model's maximum context length is "
|
||||
f"{max_model_len} tokens. However, you requested "
|
||||
f"{max_tokens} output tokens and your prompt contains "
|
||||
f"{token_num} input tokens, for a total of "
|
||||
f"{token_num + max_tokens} tokens "
|
||||
f"({token_num} + {max_tokens} = "
|
||||
f"{token_num + max_tokens} > {max_model_len}). "
|
||||
f"Please reduce the length of the input prompt or the "
|
||||
f"number of requested output tokens.",
|
||||
parameter="max_tokens",
|
||||
value=max_tokens,
|
||||
)
|
||||
|
||||
@@ -253,13 +253,14 @@ class TokenizeParams:
|
||||
# To save resources, fail the request outright without even
|
||||
# attempting tokenization
|
||||
raise VLLMValidationError(
|
||||
f"You passed {len(text)} input characters "
|
||||
f"and requested {self.max_output_tokens} output tokens. "
|
||||
f"However, the model's context length is only "
|
||||
f"{self.max_total_tokens} tokens, resulting in a maximum "
|
||||
f"input length of {max_input_tokens} tokens "
|
||||
f"(at most {max_input_chars} characters). "
|
||||
f"Please reduce the length of the input prompt.",
|
||||
f"This model's maximum context length is "
|
||||
f"{self.max_total_tokens} tokens. However, you requested "
|
||||
f"{self.max_output_tokens} output tokens and your prompt "
|
||||
f"contains {len(text)} characters (more than "
|
||||
f"{max_input_chars} characters, which is the upper bound "
|
||||
f"for {max_input_tokens} input tokens). "
|
||||
f"Please reduce the length of the input prompt or the "
|
||||
f"number of requested output tokens.",
|
||||
parameter="input_text",
|
||||
value=len(text),
|
||||
)
|
||||
@@ -334,15 +335,22 @@ class TokenizeParams:
|
||||
return tokens
|
||||
|
||||
if len(tokens) > max_input_tokens:
|
||||
token_count = len(tokens)
|
||||
# The tokenizer may have truncated the prompt to
|
||||
# max_input_tokens + 1 (see get_encode_kwargs), so the
|
||||
# actual prompt length could be larger.
|
||||
qualifier = "at least " if token_count == max_input_tokens + 1 else ""
|
||||
total = token_count + self.max_output_tokens
|
||||
raise VLLMValidationError(
|
||||
f"You passed {len(tokens)} input tokens "
|
||||
f"and requested {self.max_output_tokens} output tokens. "
|
||||
f"However, the model's context length is only "
|
||||
f"{self.max_total_tokens} tokens, resulting in a maximum "
|
||||
f"input length of {max_input_tokens} tokens. "
|
||||
f"Please reduce the length of the input prompt.",
|
||||
f"This model's maximum context length is "
|
||||
f"{self.max_total_tokens} tokens. However, you requested "
|
||||
f"{self.max_output_tokens} output tokens and your prompt "
|
||||
f"contains {qualifier}{token_count} input tokens, "
|
||||
f"for a total of {qualifier}{total} tokens. "
|
||||
f"Please reduce the length of the input prompt or the "
|
||||
f"number of requested output tokens.",
|
||||
parameter="input_tokens",
|
||||
value=len(tokens),
|
||||
value=token_count,
|
||||
)
|
||||
|
||||
return tokens
|
||||
|
||||
Reference in New Issue
Block a user