[BugFix] --max-model-len=-1 causes over-limit requests to hang and starve the entire service (#39102)

Signed-off-by: triangle14 <y1019026570@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
triangleXIV
2026-04-09 05:03:17 +08:00
committed by GitHub
parent ad05edfbca
commit 7c94ae16c6
5 changed files with 82 additions and 4 deletions

View File

@@ -15,6 +15,7 @@ import pytest
from tests.conftest import VllmRunner
from tests.utils import create_new_process_for_each_test
from vllm.exceptions import VLLMValidationError
@create_new_process_for_each_test()
@@ -61,3 +62,42 @@ def test_decoder_max_context_length_validation(
with pytest.raises(ValueError) as excinfo:
vllm_model.generate_greedy(prompt_ids, max_tokens)
assert expected_msg in str(excinfo.value)
@create_new_process_for_each_test()
@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
def test_auto_fit_max_model_len_rejects_oversized_input(
    model: str,
    vllm_runner: type[VllmRunner],
) -> None:
    """Verify the frontend rejects prompts longer than an auto-fitted limit.

    With max_model_len=-1 ('auto') and a deliberately tiny KV-cache budget,
    the engine shrinks max_model_len to whatever fits in memory. The
    frontend must observe that reduced limit and raise a validation error
    for over-length prompts instead of accepting them and hanging.
    """
    # A 1 MB KV-cache budget forces auto-fit down to a very small
    # max_model_len (on the order of ~16 tokens).
    tiny_kv_budget = 1_000_000
    with vllm_runner(
        model_name=model,
        max_model_len=-1,
        max_num_seqs=1,
        enforce_eager=True,
        kv_cache_memory_bytes=tiny_kv_budget,
        load_format="dummy",
    ) as vllm_model:
        engine_cfg = vllm_model.llm.llm_engine.vllm_config
        auto_fitted_len = engine_cfg.model_config.max_model_len
        # Sanity check: auto-fit should have pushed the limit well below
        # the model's native 2048-token context window.
        assert auto_fitted_len < 2048, (
            f"Expected auto-fit to reduce max_model_len significantly, "
            f"but got {auto_fitted_len}"
        )
        # Any prompt past the fitted limit must be rejected up front.
        too_long_prompt = [[43] * (auto_fitted_len + 10)]
        with pytest.raises(VLLMValidationError, match="Please reduce the length"):
            vllm_model.generate_greedy(too_long_prompt, max_tokens=4)

View File

@@ -114,7 +114,7 @@ def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
return 1
def recv_multipart(self):
return (b"\x00\x00", b"ready")
return (b"\x00\x00", b"")
class DummySocket:
def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):