[BugFix] Handle errors when preprocessing added requests (#30895)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
56
tests/v1/engine/test_preprocess_error_handling.py
Normal file
56
tests/v1/engine/test_preprocess_error_handling.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch.cuda
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.core import EngineCore
|
||||
|
||||
# Model identifier handed to `LLM(model=...)` by the test in this file.
# NOTE(review): the name suggests a tiny, randomly initialised Llama
# checkpoint, presumably chosen to keep the test fast — confirm.
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
|
||||
|
||||
def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
    """Check that one request failing in preprocessing leaves the engine usable.

    ``EngineCore.preprocess_add_request`` is patched so that it raises for any
    request whose first prompt token id is 333 and delegates to the real
    implementation otherwise. The test then asserts that:

    * a request starting with that token id comes back as a single finished
      output with ``finish_reason == "error"`` and no generated tokens, and
    * a following ordinary text prompt still produces tokens and finishes
      with ``"stop"`` or ``"length"``.
    """
    # Precondition stated by the assertion message: the engine core process
    # must be forked, which is not possible once CUDA has been initialised.
    assert not torch.cuda.is_initialized(), (
        "fork needs to be used for the engine "
        "core process and this isn't possible if cuda is already initialized"
    )

    # Imported locally, as this test has always done.
    from vllm.inputs import TokensPrompt

    # A prompt whose first token id equals this value triggers the failure.
    trigger_token_id = 333

    # Keep the real method so every other request is preprocessed normally.
    unpatched_preprocess = EngineCore.preprocess_add_request

    def preprocess_failing_on_trigger(self, request: EngineCoreRequest):
        token_ids = request.prompt_token_ids
        # Empty/missing prompts and any other leading token take the
        # regular path.
        if not token_ids or token_ids[0] != trigger_token_id:
            return unpatched_preprocess(self, request)
        raise ValueError("Simulated preprocessing error!")

    monkeypatch.setattr(
        EngineCore, "preprocess_add_request", preprocess_failing_on_trigger
    )

    llm = LLM(model=MODEL_NAME)

    # Pass token ids directly (rather than text) so the first token id is
    # exactly the trigger value.
    poisoned_prompt = TokensPrompt(prompt_token_ids=[trigger_token_id])
    failed_results = llm.generate(poisoned_prompt, SamplingParams(max_tokens=10))  # type: ignore

    # The failure is reported through the request output: finished, flagged
    # as an error, and with nothing generated.
    assert len(failed_results) == 1
    failed_request = failed_results[0]
    failed_completion = failed_request.outputs[0]
    assert len(failed_completion.token_ids) == 0
    assert failed_request.finished
    assert failed_completion.finish_reason == "error"

    # A regular request issued afterwards must still be served.
    healthy_results = llm.generate("Hello, my name is", SamplingParams(max_tokens=10))
    assert len(healthy_results) == 1
    healthy_completion = healthy_results[0].outputs[0]
    assert len(healthy_completion.token_ids) > 0
    assert healthy_completion.finish_reason in ("stop", "length")
|
||||
Reference in New Issue
Block a user