[BugFix] --max-model-len=-1 causes over-limit requests to hang and starve the entire service (#39102)
Signed-off-by: triangle14 <y1019026570@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in the following diff:
@@ -15,6 +15,7 @@ import pytest
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.utils import create_new_process_for_each_test
|
||||
from vllm.exceptions import VLLMValidationError
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
@@ -61,3 +62,42 @@ def test_decoder_max_context_length_validation(
|
||||
with pytest.raises(ValueError) as excinfo:
|
||||
vllm_model.generate_greedy(prompt_ids, max_tokens)
|
||||
assert expected_msg in str(excinfo.value)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
def test_auto_fit_max_model_len_rejects_oversized_input(
    model: str,
    vllm_runner: type[VllmRunner],
) -> None:
    """Regression test for max_model_len=-1 ('auto') with a tiny KV cache.

    With a very small KV-cache budget the engine auto-fits max_model_len
    down to a handful of tokens. The frontend must observe that reduced
    limit and reject prompts exceeding it up front, instead of accepting
    them and hanging the request.
    """
    # A 1 MB KV-cache budget forces the auto-fit to land on a very small
    # max_model_len (on the order of ~16 tokens for this model).
    tiny_kv_budget_bytes = 1_000_000  # 1 MB

    with vllm_runner(
        model_name=model,
        max_model_len=-1,
        max_num_seqs=1,
        enforce_eager=True,
        kv_cache_memory_bytes=tiny_kv_budget_bytes,
        load_format="dummy",
    ) as vllm_model:
        engine_cfg = vllm_model.llm.llm_engine.vllm_config
        fitted_len = engine_cfg.model_config.max_model_len

        # Sanity check: auto-fit must have shrunk the limit well below
        # the model's native context window.
        assert fitted_len < 2048, (
            f"Expected auto-fit to reduce max_model_len significantly, "
            f"but got {fitted_len}"
        )

        # Any prompt longer than the fitted limit has to be rejected.
        too_long_prompt = [[43] * (fitted_len + 10)]
        with pytest.raises(VLLMValidationError, match="Please reduce the length"):
            vllm_model.generate_greedy(too_long_prompt, max_tokens=4)
|
||||
|
||||
@@ -114,7 +114,7 @@ def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
|
||||
return 1
|
||||
|
||||
def recv_multipart(self):
|
||||
return (b"\x00\x00", b"ready")
|
||||
return (b"\x00\x00", b"")
|
||||
|
||||
class DummySocket:
|
||||
def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):
|
||||
|
||||
Reference in New Issue
Block a user