# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import time
from unittest.mock import Mock

import pytest
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy

from vllm.config import ModelConfig
from vllm.renderers.mistral import MistralRenderer, safe_apply_chat_template
from vllm.tokenizers.mistral import MistralTokenizer


@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop():
    expected_tokens = [1, 2, 3]

    # Mock the blocking version to sleep
    def mocked_apply_chat_template(*_args, **_kwargs):
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template = mocked_apply_chat_template
    mock_renderer = MistralRenderer(Mock(spec=ModelConfig), tokenizer_kwargs={})
    mock_renderer._tokenizer = mock_tokenizer

    # Make sure the render is scheduled on the loop so it runs concurrently
    # with the checks below; a bare, unawaited coroutine would never start.
    task = asyncio.ensure_future(mock_renderer.render_messages_async([]))

    # Ensure the event loop is not blocked
    blocked_count = 0
    for _i in range(20):  # Check over ~2 seconds
        start = time.perf_counter()
        await asyncio.sleep(0)
        elapsed = time.perf_counter() - start

        # an overly generous elapsed time for slow machines
        if elapsed >= 0.5:
            blocked_count += 1

        await asyncio.sleep(0.1)

    # Ensure task completes
    _, prompt = await task
    assert prompt["prompt_token_ids"] == expected_tokens, (
        "Mocked blocking tokenizer was not called"
    )
    assert blocked_count == 0, "Event loop blocked during tokenization"
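
# Background for the test above: render_messages_async is expected to hand
# the blocking apply_chat_template call off to a worker thread instead of
# running it on the event loop. A minimal sketch of that pattern (an
# illustrative assumption, not necessarily vLLM's actual implementation):
#
#     async def render_messages_async(self, messages):
#         loop = asyncio.get_running_loop()
#         return await loop.run_in_executor(
#             None, self._tokenizer.apply_chat_template, messages
#         )
#
# If the call ran inline instead, each `await asyncio.sleep(0)` in the loop
# above would stall for the full two seconds of the mocked time.sleep.
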
def test_apply_mistral_chat_template_thinking_chunk():
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {
                    "type": "thinking",
                    "closed": True,
                    "thinking": "Only return the answer when you are confident.",
                },
            ],
        },
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "Let me think about it."},
                {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
                {
                    "type": "text",
                    "text": "The answer is 4.",
                },
            ],
        },
        {"role": "user", "content": "Thanks, what is 3+3?"},
    ]
    mistral_tokenizer = MistralTokenizer.from_pretrained(
        "mistralai/Magistral-Small-2509"
    )

    token_ids = safe_apply_chat_template(
        mistral_tokenizer, messages, chat_template=None, tools=None
    )

    # Decode with special tokens kept so the template's control tokens
    # ([SYSTEM_PROMPT], [INST], [THINK], ...) are visible in the output.
    string_tokens = mistral_tokenizer.mistral.decode(
        token_ids, special_token_policy=SpecialTokenPolicy.KEEP
    )

    expected_tokens = (
        r"<s>[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
        r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
        r"[INST]What is 2+2?[/INST]"
        r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>"
        r"[INST]Thanks, what is 3+3?[/INST]"
    )

    assert string_tokens == expected_tokens
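
# Note: the template test above resolves "mistralai/Magistral-Small-2509" by
# name, so it needs the tokenizer files to be locally cached or downloadable
# (typically from the Hugging Face Hub).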