[Bugfix] Fix misleading context length error messages (#36197)

Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 22:15:12 -08:00
parent 86e1060b17
commit 43f10573c9
4 changed files with 35 additions and 23 deletions
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -200,7 +200,7 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
    batch_2 = [valid_msg, valid_msg]
    sampling_params = SamplingParams(temperature=0, max_tokens=10)

-    with pytest.raises(ValueError, match="context length is only"):
+    with pytest.raises(ValueError, match="maximum context length is"):
        llm.chat(batch_1, sampling_params=sampling_params)
    assert llm.llm_engine.get_num_unfinished_requests() == 0

--- a/tests/renderers/test_completions.py
+++ b/tests/renderers/test_completions.py
@@ -271,7 +271,7 @@ class TestRenderPrompt:

        with pytest.raises(
            ValueError,
-            match="input characters and requested .* context length is only",
+            match="maximum context length is",
        ):
            renderer.tokenize_prompts(
                prompts,
@@ -292,7 +292,7 @@ class TestRenderPrompt:

        with pytest.raises(
            ValueError,
-            match="input tokens and requested .* context length is only",
+            match="maximum context length is",
        ):
            renderer.tokenize_prompts(
                prompts,
@@ -313,7 +313,7 @@ class TestRenderPrompt:

        with pytest.raises(
            ValueError,
-            match="input tokens and requested .* context length is only",
+            match="maximum context length is",
        ):
            renderer.tokenize_prompts(
                prompts,
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -791,11 +791,15 @@ class OpenAIServing:

        if max_tokens is not None and token_num + max_tokens > max_model_len:
            raise VLLMValidationError(
-                "'max_tokens' or 'max_completion_tokens' is too large: "
-                f"{max_tokens}. This model's maximum context length is "
-                f"{max_model_len} tokens and your request has "
-                f"{token_num} input tokens ({max_tokens} > {max_model_len}"
-                f" - {token_num}).",
+                f"This model's maximum context length is "
+                f"{max_model_len} tokens. However, you requested "
+                f"{max_tokens} output tokens and your prompt contains "
+                f"{token_num} input tokens, for a total of "
+                f"{token_num + max_tokens} tokens "
+                f"({token_num} + {max_tokens} = "
+                f"{token_num + max_tokens} > {max_model_len}). "
+                f"Please reduce the length of the input prompt or the "
+                f"number of requested output tokens.",
                parameter="max_tokens",
                value=max_tokens,
            )
--- a/vllm/renderers/params.py
+++ b/vllm/renderers/params.py
@@ -253,13 +253,14 @@ class TokenizeParams:
                # To save resources, fail the request outright without even
                # attempting tokenization
                raise VLLMValidationError(
-                    f"You passed {len(text)} input characters "
-                    f"and requested {self.max_output_tokens} output tokens. "
-                    f"However, the model's context length is only "
-                    f"{self.max_total_tokens} tokens, resulting in a maximum "
-                    f"input length of {max_input_tokens} tokens "
-                    f"(at most {max_input_chars} characters). "
-                    f"Please reduce the length of the input prompt.",
+                    f"This model's maximum context length is "
+                    f"{self.max_total_tokens} tokens. However, you requested "
+                    f"{self.max_output_tokens} output tokens and your prompt "
+                    f"contains {len(text)} characters (more than "
+                    f"{max_input_chars} characters, which is the upper bound "
+                    f"for {max_input_tokens} input tokens). "
+                    f"Please reduce the length of the input prompt or the "
+                    f"number of requested output tokens.",
                    parameter="input_text",
                    value=len(text),
                )
@@ -334,15 +335,22 @@ class TokenizeParams:
            return tokens

        if len(tokens) > max_input_tokens:
+            token_count = len(tokens)
+            # The tokenizer may have truncated the prompt to
+            # max_input_tokens + 1 (see get_encode_kwargs), so the
+            # actual prompt length could be larger.
+            qualifier = "at least " if token_count == max_input_tokens + 1 else ""
+            total = token_count + self.max_output_tokens
            raise VLLMValidationError(
-                f"You passed {len(tokens)} input tokens "
-                f"and requested {self.max_output_tokens} output tokens. "
-                f"However, the model's context length is only "
-                f"{self.max_total_tokens} tokens, resulting in a maximum "
-                f"input length of {max_input_tokens} tokens. "
-                f"Please reduce the length of the input prompt.",
+                f"This model's maximum context length is "
+                f"{self.max_total_tokens} tokens. However, you requested "
+                f"{self.max_output_tokens} output tokens and your prompt "
+                f"contains {qualifier}{token_count} input tokens, "
+                f"for a total of {qualifier}{total} tokens. "
+                f"Please reduce the length of the input prompt or the "
+                f"number of requested output tokens.",
                parameter="input_tokens",
-                value=len(tokens),
+                value=token_count,
            )

        return tokens