2025-02-02 14:58:18 -05:00
# SPDX-License-Identifier: Apache-2.0
2025-06-03 11:20:17 -07:00
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
2025-02-02 14:58:18 -05:00
2025-03-03 01:34:51 +00:00
from collections . abc import Generator
from typing import Any
2023-09-13 13:38:01 -07:00
2024-03-25 23:59:47 +09:00
import pytest
2025-04-17 07:45:24 -07:00
from transformers import AutoTokenizer , PreTrainedTokenizer , PreTrainedTokenizerFast
2023-09-13 13:38:01 -07:00
2025-09-21 08:52:15 -07:00
from vllm . sampling_params import SamplingParams
2024-11-01 11:33:15 -06:00
from vllm . transformers_utils . tokenizers . mistral import MistralTokenizer
2025-04-17 07:45:24 -07:00
from vllm . v1 . engine import EngineCoreRequest
from vllm . v1 . engine . detokenizer import (
FastIncrementalDetokenizer ,
IncrementalDetokenizer ,
SlowIncrementalDetokenizer ,
)
# Sample text in which several special-token-like markers sit directly next to
# each other; it is appended to TRUTH so the streaming test covers this case.
SPECIAL_TOKS_TRUTH = [
    "Some text with adjacent special tokens <|padding|><|padding|><fim_prefix><fim_middle><fim_suffix>other text<fim_pad>",  # noqa
]
2023-09-13 13:38:01 -07:00
# Sample texts used to parametrize the streaming-decode test.
TRUTH = [
    "Hello here, this is a simple test",
    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
    "我很感谢你的热情",
    # Burmese text triggers an edge case for Mistral's V3-Tekken tokenizer
    # (e.g. for mistralai/Pixtral-12B-2409) where tokens may map to bytes with
    # incomplete UTF-8 characters.
    # See https://github.com/vllm-project/vllm/pull/9625
    "ပုံပြင်လေးပြောပြပါ်",
    # The special-token samples are appended at the end of the list.
    *SPECIAL_TOKS_TRUTH,
]
2023-09-13 13:38:01 -07:00
# Tokenizer identifiers the parametrized tests below are run against. Names
# containing "mistral" are loaded through MistralTokenizer by the `tokenizer`
# fixture; all others go through AutoTokenizer.
TOKENIZERS = [
    "facebook/opt-125m",
    "gpt2",
    "bigcode/tiny_starcoder_py",
    "EleutherAI/gpt-j-6b",
    "EleutherAI/pythia-70m",
    "bigscience/bloom-560m",
    "mosaicml/mpt-7b",
    "tiiuae/falcon-7b",
    "meta-llama/Llama-3.2-1B-Instruct",
    "codellama/CodeLlama-7b-hf",
    "mistralai/Pixtral-12B-2409",
]
2025-04-17 07:45:24 -07:00
def _run_incremental_decode(
    tokenizer,
    all_input_ids,
    skip_special_tokens: bool,
    starting_index: int,
    spaces_between_special_tokens: bool = True,
    fast: bool | None = None,
):
    """Stream tokens one by one through an incremental detokenizer.

    Args:
        tokenizer: Tokenizer handed to the detokenizer under test.
        all_input_ids: Full token id sequence (prompt followed by the tokens
            to be streamed).
        skip_special_tokens: Forwarded to ``SamplingParams``.
        starting_index: Number of leading ids treated as the prompt; only
            ``all_input_ids[starting_index:]`` is streamed.
        spaces_between_special_tokens: Forwarded to ``SamplingParams``.
        fast: ``None`` lets ``IncrementalDetokenizer.from_new_request`` pick
            the implementation, ``True`` forces ``FastIncrementalDetokenizer``
            and ``False`` forces ``SlowIncrementalDetokenizer``.

    Returns:
        A ``(text, token_ids)`` tuple: the concatenation of every text delta
        returned while streaming, and the detokenizer's ``output_token_ids``.
    """
    prompt_token_ids = all_input_ids[:starting_index]
    generated_token_ids = all_input_ids[starting_index:]

    params = SamplingParams(
        skip_special_tokens=skip_special_tokens,
        spaces_between_special_tokens=spaces_between_special_tokens,
    )
    request = EngineCoreRequest(
        request_id="",
        prompt_token_ids=prompt_token_ids,
        mm_features=None,
        sampling_params=params,
        pooling_params=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )

    if fast is None:
        detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)
    elif fast:
        detokenizer = FastIncrementalDetokenizer(tokenizer, request)
    else:
        detokenizer = SlowIncrementalDetokenizer(tokenizer, request)

    # The loop index runs over the generated slice only, so the "finished"
    # flag must be derived from that slice's length. Comparing against
    # len(all_input_ids) never marked the last token when a prompt was present.
    last_index = len(generated_token_ids) - 1
    deltas = []
    for i, token_id in enumerate(generated_token_ids):
        detokenizer.update([token_id], False)
        finished = i == last_index
        deltas.append(detokenizer.get_next_output_text(finished, delta=True))

    return "".join(deltas), detokenizer.output_token_ids
2023-09-13 13:38:01 -07:00
2024-11-01 11:33:15 -06:00
@pytest.fixture
def tokenizer(tokenizer_name):
    """Load the tokenizer identified by ``tokenizer_name``.

    Names containing ``"mistral"`` are loaded with ``MistralTokenizer``;
    every other name is loaded with ``AutoTokenizer``.
    """
    if "mistral" in tokenizer_name:
        return MistralTokenizer.from_pretrained(tokenizer_name)
    return AutoTokenizer.from_pretrained(tokenizer_name)
2024-11-01 11:33:15 -06:00
@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
@pytest.mark.parametrize(
    "truth",
    [
        # Burmese text triggers an edge case where tokens may map to bytes
        # with incomplete UTF-8 characters.
        "ပုံပြင်လေးပြောပြပါ",
        # Using "URGENCY" since "CY" has token id 130282.
        "URGENCY🌶️",
    ],
)
def test_mistral_edge_case(tokenizer, truth):
    """Check specific edge cases of the V3-Tekken MistralTokenizer.

    The text is tokenized without special tokens, streamed through the
    incremental detokenizer from the first token, and must come back
    unchanged together with the same token ids.

    See https://github.com/vllm-project/vllm/pull/9625
    """
    first_streamed = 0
    input_ids = tokenizer(truth, add_special_tokens=False).input_ids

    streamed_text, streamed_ids = _run_incremental_decode(
        tokenizer,
        input_ids,
        skip_special_tokens=True,
        starting_index=first_streamed,
    )

    assert streamed_text == truth
    assert streamed_ids == input_ids[first_streamed:]
2024-11-01 11:33:15 -06:00
@pytest.fixture
def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
    """Yield the indirectly parametrized ``skip_special_tokens`` flag.

    For tokenizer names containing ``"mistral"`` a falsy parameter skips the
    test instead of yielding a value; otherwise the parameter is yielded as
    a ``bool``.
    """
    wants_skip = request.param
    if "mistral" in tokenizer_name and not wants_skip:
        pytest.skip("mistral doesn't support skip_special_tokens=False")
    yield bool(wants_skip)
2024-11-01 11:33:15 -06:00
2023-09-13 13:38:01 -07:00
@pytest.mark.parametrize("truth", TRUTH)
@pytest.mark.parametrize("with_prompt", [True, False])
@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
@pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True)
@pytest.mark.parametrize("spaces_between_special_tokens", (True, False))
@pytest.mark.parametrize("fast", (True, False))
def test_decode_streaming(
    tokenizer,
    truth,
    with_prompt,
    skip_special_tokens,
    spaces_between_special_tokens,
    fast,
):
    """Compare token-by-token streaming decode with a one-shot decode.

    The sample text is tokenized, wrapped with BOS (when the tokenizer
    defines one) and EOS, and decoded in one call to obtain the expected
    text. The same ids are then streamed through the incremental
    detokenizer, optionally treating roughly the first half as a prompt,
    and the streamed text and ids must match the expectation.
    """
    is_hf_fast = isinstance(tokenizer, PreTrainedTokenizerFast)
    if fast and not is_hf_fast:
        pytest.skip()
    if skip_special_tokens and not spaces_between_special_tokens:
        pytest.skip()

    if is_hf_fast and not fast:
        # Fix up inconsistency in fast/slow tokenizer behaviour.
        special_added_tokens = [
            added
            for added in tokenizer._tokenizer.get_added_tokens_decoder().values()
            if added.special
        ]
        tokenizer.add_special_tokens(
            {"additional_special_tokens": special_added_tokens}
        )

    # spaces_between_special_tokens is only passed to decode() for
    # PreTrainedTokenizer instances.
    if isinstance(tokenizer, PreTrainedTokenizer):
        decode_kwargs = {
            "spaces_between_special_tokens": spaces_between_special_tokens
        }
    else:
        decode_kwargs = {}

    bos_id = tokenizer.bos_token_id
    has_bos = bos_id is not None

    token_ids = tokenizer(truth, add_special_tokens=False).input_ids
    if has_bos:
        token_ids.insert(0, bos_id)
    token_ids.append(tokenizer.eos_token_id)

    expected_full = tokenizer.decode(
        token_ids, skip_special_tokens=skip_special_tokens, **decode_kwargs
    )

    if not with_prompt:
        expected = expected_full
        starting_index = 0
        all_input_ids = token_ids
    else:
        # The prompt length is the token count of the first half of the text,
        # plus one for the BOS token when present.
        half_text = truth[: len(truth) // 2]
        prompt_len = len(tokenizer(half_text, add_special_tokens=False).input_ids)
        if has_bos:
            prompt_len += 1

        prompt_ids = token_ids[:prompt_len]
        generated_ids = token_ids[prompt_len:]
        all_input_ids = prompt_ids + generated_ids
        starting_index = len(prompt_ids)

        prompt_text = tokenizer.decode(
            prompt_ids,
            skip_special_tokens=skip_special_tokens,
            **decode_kwargs,
        )
        # Expected streamed text is the full decode minus the prompt's text.
        expected = expected_full[len(prompt_text) :]

    streamed_text, streamed_ids = _run_incremental_decode(
        tokenizer,
        all_input_ids,
        skip_special_tokens=skip_special_tokens,
        starting_index=starting_index,
        spaces_between_special_tokens=spaces_between_special_tokens,
        fast=fast,
    )

    assert streamed_text == expected
    assert streamed_ids == all_input_ids[starting_index:]
2024-03-22 13:44:12 -07:00
2025-04-17 07:45:24 -07:00
@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
@pytest.mark.parametrize("fast", (True, False))
def test_oov_decode(tokenizer, fast):
    """Stream the single token id ``len(tokenizer)`` and expect empty text.

    That id is presumably outside the vocabulary (hence "oov"). The streamed
    text must be empty while the id itself is still reported back.
    """
    if fast and not isinstance(tokenizer, PreTrainedTokenizerFast):
        pytest.skip()

    oov_id = len(tokenizer)
    streamed_text, streamed_ids = _run_incremental_decode(
        tokenizer,
        [oov_id],
        skip_special_tokens=True,
        starting_index=0,
        spaces_between_special_tokens=True,
        fast=fast,
    )

    assert streamed_text == ""
    assert streamed_ids == [oov_id]