[BugFix] --max-model-len=-1 causes over-limit requests to hang and starve the entire service (#39102)

Signed-off-by: triangle14 <y1019026570@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
triangleXIV
2026-04-09 05:03:17 +08:00
committed by GitHub
parent ad05edfbca
commit 7c94ae16c6
5 changed files with 82 additions and 4 deletions

View File

@@ -15,6 +15,7 @@ import pytest
from tests.conftest import VllmRunner
from tests.utils import create_new_process_for_each_test
from vllm.exceptions import VLLMValidationError
@create_new_process_for_each_test()
@@ -61,3 +62,42 @@ def test_decoder_max_context_length_validation(
with pytest.raises(ValueError) as excinfo:
vllm_model.generate_greedy(prompt_ids, max_tokens)
assert expected_msg in str(excinfo.value)
@create_new_process_for_each_test()
@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
def test_auto_fit_max_model_len_rejects_oversized_input(
    model: str,
    vllm_runner: type[VllmRunner],
) -> None:
    """Verify the frontend rejects prompts longer than an auto-fitted limit.

    With max_model_len=-1 ('auto') and a deliberately tiny KV-cache budget,
    the engine shrinks max_model_len to whatever fits in memory. The
    frontend must observe that reduced limit and raise a validation error
    for over-length prompts instead of accepting them and hanging.
    """
    # A 1 MB KV-cache budget forces auto-fit down to a very small
    # max_model_len (on the order of ~16 tokens).
    tiny_kv_budget = 1_000_000
    with vllm_runner(
        model_name=model,
        max_model_len=-1,
        max_num_seqs=1,
        enforce_eager=True,
        kv_cache_memory_bytes=tiny_kv_budget,
        load_format="dummy",
    ) as vllm_model:
        engine_cfg = vllm_model.llm.llm_engine.vllm_config
        auto_fitted_len = engine_cfg.model_config.max_model_len
        # Sanity check: auto-fit should have pushed the limit well below
        # the model's native 2048-token context window.
        assert auto_fitted_len < 2048, (
            f"Expected auto-fit to reduce max_model_len significantly, "
            f"but got {auto_fitted_len}"
        )
        # Any prompt past the fitted limit must be rejected up front.
        too_long_prompt = [[43] * (auto_fitted_len + 10)]
        with pytest.raises(VLLMValidationError, match="Please reduce the length"):
            vllm_model.generate_greedy(too_long_prompt, max_tokens=4)

View File

@@ -114,7 +114,7 @@ def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
return 1
def recv_multipart(self):
return (b"\x00\x00", b"ready")
return (b"\x00\x00", b"")
class DummySocket:
def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):