diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index ba3b80320..20ed73e26 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -200,7 +200,7 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
     batch_2 = [valid_msg, valid_msg]
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
 
-    with pytest.raises(ValueError, match="context length is only"):
+    with pytest.raises(ValueError, match="maximum context length is"):
         llm.chat(batch_1, sampling_params=sampling_params)
 
     assert llm.llm_engine.get_num_unfinished_requests() == 0
diff --git a/tests/renderers/test_completions.py b/tests/renderers/test_completions.py
index 492f539e4..e15eae626 100644
--- a/tests/renderers/test_completions.py
+++ b/tests/renderers/test_completions.py
@@ -271,7 +271,7 @@ class TestRenderPrompt:
 
         with pytest.raises(
             ValueError,
-            match="input characters and requested .* context length is only",
+            match="maximum context length is",
         ):
             renderer.tokenize_prompts(
                 prompts,
@@ -292,7 +292,7 @@ class TestRenderPrompt:
 
         with pytest.raises(
             ValueError,
-            match="input tokens and requested .* context length is only",
+            match="maximum context length is",
         ):
             renderer.tokenize_prompts(
                 prompts,
@@ -313,7 +313,7 @@ class TestRenderPrompt:
 
         with pytest.raises(
             ValueError,
-            match="input tokens and requested .* context length is only",
+            match="maximum context length is",
         ):
             renderer.tokenize_prompts(
                 prompts,
diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py
index 44954ef9d..f52cd1725 100644
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -791,11 +791,15 @@ class OpenAIServing:
 
         if max_tokens is not None and token_num + max_tokens > max_model_len:
             raise VLLMValidationError(
-                "'max_tokens' or 'max_completion_tokens' is too large: "
-                f"{max_tokens}. This model's maximum context length is "
-                f"{max_model_len} tokens and your request has "
-                f"{token_num} input tokens ({max_tokens} > {max_model_len}"
-                f" - {token_num}).",
+                f"This model's maximum context length is "
+                f"{max_model_len} tokens. However, you requested "
+                f"{max_tokens} output tokens and your prompt contains "
+                f"{token_num} input tokens, for a total of "
+                f"{token_num + max_tokens} tokens "
+                f"({token_num} + {max_tokens} = "
+                f"{token_num + max_tokens} > {max_model_len}). "
+                f"Please reduce the length of the input prompt or the "
+                f"number of requested output tokens.",
                 parameter="max_tokens",
                 value=max_tokens,
             )
diff --git a/vllm/renderers/params.py b/vllm/renderers/params.py
index 52a7b9675..3ce7cf5e1 100644
--- a/vllm/renderers/params.py
+++ b/vllm/renderers/params.py
@@ -253,13 +253,14 @@ class TokenizeParams:
             # To save resources, fail the request outright without even
             # attempting tokenization
             raise VLLMValidationError(
-                f"You passed {len(text)} input characters "
-                f"and requested {self.max_output_tokens} output tokens. "
-                f"However, the model's context length is only "
-                f"{self.max_total_tokens} tokens, resulting in a maximum "
-                f"input length of {max_input_tokens} tokens "
-                f"(at most {max_input_chars} characters). "
-                f"Please reduce the length of the input prompt.",
+                f"This model's maximum context length is "
+                f"{self.max_total_tokens} tokens. However, you requested "
+                f"{self.max_output_tokens} output tokens and your prompt "
+                f"contains {len(text)} characters (more than "
+                f"{max_input_chars} characters, which is the upper bound "
+                f"for {max_input_tokens} input tokens). "
+                f"Please reduce the length of the input prompt or the "
+                f"number of requested output tokens.",
                 parameter="input_text",
                 value=len(text),
             )
@@ -334,15 +335,22 @@ class TokenizeParams:
             return tokens
 
         if len(tokens) > max_input_tokens:
+            token_count = len(tokens)
+            # The tokenizer may have truncated the prompt to
+            # max_input_tokens + 1 (see get_encode_kwargs), so the
+            # actual prompt length could be larger.
+            qualifier = "at least " if token_count == max_input_tokens + 1 else ""
+            total = token_count + self.max_output_tokens
             raise VLLMValidationError(
-                f"You passed {len(tokens)} input tokens "
-                f"and requested {self.max_output_tokens} output tokens. "
-                f"However, the model's context length is only "
-                f"{self.max_total_tokens} tokens, resulting in a maximum "
-                f"input length of {max_input_tokens} tokens. "
-                f"Please reduce the length of the input prompt.",
+                f"This model's maximum context length is "
+                f"{self.max_total_tokens} tokens. However, you requested "
+                f"{self.max_output_tokens} output tokens and your prompt "
+                f"contains {qualifier}{token_count} input tokens, "
+                f"for a total of {qualifier}{total} tokens. "
+                f"Please reduce the length of the input prompt or the "
+                f"number of requested output tokens.",
                 parameter="input_tokens",
-                value=len(tokens),
+                value=token_count,
             )
 
         return tokens
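
For reference, a minimal standalone sketch of how the reworded serving.py message renders and why the relaxed match pattern "maximum context length is" in the updated tests still finds it. The request sizes below are made up for illustration and are not taken from the diff.

import re

# Hypothetical sizes, chosen only to show the arithmetic spelled out in the
# new message (token_num + max_tokens > max_model_len).
max_model_len, token_num, max_tokens = 4096, 4000, 200

message = (
    f"This model's maximum context length is "
    f"{max_model_len} tokens. However, you requested "
    f"{max_tokens} output tokens and your prompt contains "
    f"{token_num} input tokens, for a total of "
    f"{token_num + max_tokens} tokens "
    f"({token_num} + {max_tokens} = "
    f"{token_num + max_tokens} > {max_model_len}). "
    f"Please reduce the length of the input prompt or the "
    f"number of requested output tokens."
)

# pytest.raises(..., match=...) uses re.search, so the new substring-style
# pattern matches the reworded message, while the old "context length is only"
# pattern no longer does, which is why the test expectations above change.
assert re.search("maximum context length is", message)
assert re.search("context length is only", message) is None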