2025-02-02 14:58:18 -05:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
2025-06-03 11:20:17 -07:00
|
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
2025-02-02 14:58:18 -05:00
|
|
|
|
2024-03-25 23:59:47 +09:00
|
|
|
import pytest
|
2024-03-15 16:37:01 -07:00
|
|
|
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
2024-03-25 23:59:47 +09:00
|
|
|
|
2025-04-24 12:43:56 +01:00
|
|
|
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
2024-07-19 18:25:06 -07:00
|
|
|
|
|
|
|
|
|
2024-03-15 16:37:01 -07:00
|
|
|
@pytest.mark.asyncio
async def test_tokenizer_group():
    """Compare a LoRA-disabled TokenizerGroup against the plain gpt2 tokenizer.

    The sync and async encode paths must both produce the same token ids as
    the reference ``AutoTokenizer``, and the sync and async LoRA tokenizer
    lookups (with no LoRA request) must return equal tokenizer objects.
    """
    model_id = "gpt2"
    text = "prompt"

    reference_tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer_group = TokenizerGroup(
        tokenizer_id=model_id,
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=None,
    )

    # Synchronous encode path.
    expected_ids = reference_tokenizer.encode(text)
    sync_ids = tokenizer_group.encode(prompt=text, lora_request=None)
    assert expected_ids == sync_ids

    # Asynchronous encode path; the reference is re-encoded so each
    # comparison stands on its own.
    expected_ids = reference_tokenizer.encode(text)
    async_ids = await tokenizer_group.encode_async(prompt=text,
                                                   lora_request=None)
    assert expected_ids == async_ids

    # Passing None as the LoRA request must still yield a tokenizer object.
    base_tokenizer = tokenizer_group.get_lora_tokenizer(None)
    assert isinstance(base_tokenizer, PreTrainedTokenizerBase)

    # The sync and async lookups must return equal tokenizers.
    sync_tokenizer = tokenizer_group.get_lora_tokenizer(None)
    async_tokenizer = await tokenizer_group.get_lora_tokenizer_async(None)
    assert sync_tokenizer == async_tokenizer
|