Files
vllm/tests/entrypoints/openai/test_serving_engine.py

72 lines
2.1 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
from unittest.mock import Mock
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.tokenizers.mistral import MistralTokenizer
@pytest.fixture()
def serving() -> OpenAIServing:
    """Build an OpenAIServing backed entirely by mocks.

    The model config is a ``ModelConfig``-spec'd mock whose
    ``max_model_len`` is set to 32768; the models registry is an
    ``OpenAIServingModels``-spec'd mock carrying that config plus plain
    mocks for ``input_processor`` and ``io_processor``. The engine client
    is a plain mock and no request logger is attached.
    """
    # Spec'd mocks raise AttributeError when reading attributes the real
    # class does not define, which catches typos in the test setup.
    mocked_config = Mock(spec=ModelConfig)
    mocked_config.max_model_len = 32768

    mocked_models = Mock(spec=OpenAIServingModels)
    mocked_models.model_config = mocked_config
    mocked_models.input_processor = Mock()
    mocked_models.io_processor = Mock()

    return OpenAIServing(
        engine_client=Mock(),
        models=mocked_models,
        request_logger=None,
    )
@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop(
    serving: OpenAIServing,
):
    """Check the event loop stays responsive during a slow chat-template call.

    A mocked ``MistralTokenizer`` whose ``apply_chat_template`` sleeps for
    two seconds is passed to ``serving._apply_mistral_chat_template_async``.
    While that call is outstanding, the test repeatedly yields to the event
    loop and measures how long each yield takes; any yield taking 0.5s or
    more is counted as the loop having been blocked.
    """
    expected_tokens = [1, 2, 3]

    def slow_apply_chat_template(*_args, **_kwargs):
        # Stand-in for the synchronous tokenizer: blocks its caller for 2s.
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template.side_effect = slow_apply_chat_template

    # NOTE(review): the responsiveness check below is only meaningful if this
    # call starts the work right away (e.g. returns a future rather than a
    # lazy coroutine) -- confirm against OpenAIServing.
    pending = serving._apply_mistral_chat_template_async(
        tokenizer=mock_tokenizer,
        messages=[],
        chat_template=None,
        tools=[],
    )

    # Sample the loop 20 times, 0.1s apart (~2 seconds in total), recording
    # how long a bare ``sleep(0)`` round-trip through the loop takes.
    yield_latencies = []
    for _ in range(20):
        before = time.perf_counter()
        await asyncio.sleep(0)
        yield_latencies.append(time.perf_counter() - before)
        await asyncio.sleep(0.1)

    # The 0.5s threshold is deliberately generous so slow machines do not
    # produce false positives.
    blocked_count = sum(1 for latency in yield_latencies if latency >= 0.5)

    # Wait for the tokenization call itself to finish before asserting.
    tokens = await pending
    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
    assert blocked_count == 0, "Event loop blocked during tokenization"