diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index ba3b80320..20ed73e26 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -200,7 +200,7 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
     batch_2 = [valid_msg, valid_msg]
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
 
-    with pytest.raises(ValueError, match="context length is only"):
+    with pytest.raises(ValueError, match="maximum context length is"):
         llm.chat(batch_1, sampling_params=sampling_params)
 
     assert llm.llm_engine.get_num_unfinished_requests() == 0
diff --git a/tests/renderers/test_completions.py b/tests/renderers/test_completions.py
index 492f539e4..e15eae626 100644
--- a/tests/renderers/test_completions.py
+++ b/tests/renderers/test_completions.py
@@ -271,7 +271,7 @@ class TestRenderPrompt:
 
         with pytest.raises(
             ValueError,
-            match="input characters and requested .* context length is only",
+            match="maximum context length is",
         ):
             renderer.tokenize_prompts(
                 prompts,
@@ -292,7 +292,7 @@ class TestRenderPrompt:
 
         with pytest.raises(
             ValueError,
-            match="input tokens and requested .* context length is only",
+            match="maximum context length is",
         ):
             renderer.tokenize_prompts(
                 prompts,
@@ -313,7 +313,7 @@ class TestRenderPrompt:
 
         with pytest.raises(
             ValueError,
-            match="input tokens and requested .* context length is only",
+            match="maximum context length is",
         ):
             renderer.tokenize_prompts(
                 prompts,
diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py
index 44954ef9d..f52cd1725 100644
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -791,11 +791,15 @@ class OpenAIServing:
 
         if max_tokens is not None and token_num + max_tokens > max_model_len:
             raise VLLMValidationError(
-                "'max_tokens' or 'max_completion_tokens' is too large: "
-                f"{max_tokens}. This model's maximum context length is "
-                f"{max_model_len} tokens and your request has "
-                f"{token_num} input tokens ({max_tokens} > {max_model_len}"
-                f" - {token_num}).",
+                f"This model's maximum context length is "
+                f"{max_model_len} tokens. However, you requested "
+                f"{max_tokens} output tokens and your prompt contains "
+                f"{token_num} input tokens, for a total of "
+                f"{token_num + max_tokens} tokens "
+                f"({token_num} + {max_tokens} = "
+                f"{token_num + max_tokens} > {max_model_len}). "
+                f"Please reduce the length of the input prompt or the "
+                f"number of requested output tokens.",
                 parameter="max_tokens",
                 value=max_tokens,
             )
diff --git a/vllm/renderers/params.py b/vllm/renderers/params.py
index 52a7b9675..3ce7cf5e1 100644
--- a/vllm/renderers/params.py
+++ b/vllm/renderers/params.py
@@ -253,13 +253,14 @@ class TokenizeParams:
             # To save resources, fail the request outright without even
             # attempting tokenization
             raise VLLMValidationError(
-                f"You passed {len(text)} input characters "
-                f"and requested {self.max_output_tokens} output tokens. "
-                f"However, the model's context length is only "
-                f"{self.max_total_tokens} tokens, resulting in a maximum "
-                f"input length of {max_input_tokens} tokens "
-                f"(at most {max_input_chars} characters). "
-                f"Please reduce the length of the input prompt.",
+                f"This model's maximum context length is "
+                f"{self.max_total_tokens} tokens. However, you requested "
+                f"{self.max_output_tokens} output tokens and your prompt "
+                f"contains {len(text)} characters (more than "
+                f"{max_input_chars} characters, which is the upper bound "
+                f"for {max_input_tokens} input tokens). "
+                f"Please reduce the length of the input prompt or the "
+                f"number of requested output tokens.",
                 parameter="input_text",
                 value=len(text),
             )
@@ -334,15 +335,22 @@ class TokenizeParams:
             return tokens
 
         if len(tokens) > max_input_tokens:
+            token_count = len(tokens)
+            # The tokenizer may have truncated the prompt to
+            # max_input_tokens + 1 (see get_encode_kwargs), so the
+            # actual prompt length could be larger.
+            qualifier = "at least " if token_count == max_input_tokens + 1 else ""
+            total = token_count + self.max_output_tokens
             raise VLLMValidationError(
-                f"You passed {len(tokens)} input tokens "
-                f"and requested {self.max_output_tokens} output tokens. "
-                f"However, the model's context length is only "
-                f"{self.max_total_tokens} tokens, resulting in a maximum "
-                f"input length of {max_input_tokens} tokens. "
-                f"Please reduce the length of the input prompt.",
+                f"This model's maximum context length is "
+                f"{self.max_total_tokens} tokens. However, you requested "
+                f"{self.max_output_tokens} output tokens and your prompt "
+                f"contains {qualifier}{token_count} input tokens, "
+                f"for a total of {qualifier}{total} tokens. "
+                f"Please reduce the length of the input prompt or the "
+                f"number of requested output tokens.",
                 parameter="input_tokens",
-                value=len(tokens),
+                value=token_count,
             )
 
         return tokens
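
For reference, a minimal standalone sketch of how the reworded serving.py message renders and why the relaxed match pattern "maximum context length is" in the updated tests still finds it. The request sizes below are made up for illustration and are not taken from the diff.

import re

# Hypothetical sizes, chosen only to show the arithmetic spelled out in the
# new message (token_num + max_tokens > max_model_len).
max_model_len, token_num, max_tokens = 4096, 4000, 200

message = (
    f"This model's maximum context length is "
    f"{max_model_len} tokens. However, you requested "
    f"{max_tokens} output tokens and your prompt contains "
    f"{token_num} input tokens, for a total of "
    f"{token_num + max_tokens} tokens "
    f"({token_num} + {max_tokens} = "
    f"{token_num + max_tokens} > {max_model_len}). "
    f"Please reduce the length of the input prompt or the "
    f"number of requested output tokens."
)

# pytest.raises(..., match=...) uses re.search, so the new substring-style
# pattern matches the reworded message, while the old "context length is only"
# pattern no longer does, which is why the test expectations above change.
assert re.search("maximum context length is", message)
assert re.search("context length is only", message) is None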