[Misc] Update TokenizerLike interface and move get_cached_tokenizer (#29730)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-11-30 14:59:47 +08:00
committed by GitHub
parent 9381b5cde0
commit 2afcec4dec
15 changed files with 260 additions and 174 deletions

View File

@@ -7,7 +7,7 @@ import pytest
from transformers import AutoTokenizer
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.tokenizer import get_cached_tokenizer
from vllm.tokenizers.hf import get_cached_tokenizer
@pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"])

View File

@@ -356,8 +356,8 @@ class TestMistralTokenizer:
)
attn_mask = [1 for _ in range(len(token_ids))]
# Test 1: default
assert mistral_tokenizer("Hello world !") == {
# Test 1: no special tokens
assert mistral_tokenizer("Hello world !", add_special_tokens=False) == {
"attention_mask": attn_mask[1:],
"input_ids": token_ids[1:],
}
@@ -381,7 +381,7 @@ class TestMistralTokenizer:
"input_ids": token_ids,
}
# Test 5: empty string
assert mistral_tokenizer("") == {
assert mistral_tokenizer("", add_special_tokens=False) == {
"attention_mask": [],
"input_ids": [],
}

View File

@@ -17,20 +17,26 @@ class TestTokenizer(TokenizerLike):
def eos_token_id(self) -> int:
return 1
# Padding-token id exposed by this test tokenizer; hard-coded to 2, the
# value test_customized_tokenizer asserts on.
@property
def pad_token_id(self) -> int:
return 2
# Read-only flag on this test tokenizer; always reports True.
@property
def is_fast(self) -> bool:
return True
# Checks that TestTokenizer can be registered under the name
# "test_tokenizer" and then retrieved both through
# TokenizerRegistry.get_tokenizer and through
# get_tokenizer(..., tokenizer_mode="custom"), with the expected
# bos/eos/pad token ids (0, 1, 2) on each retrieved instance.
def test_customized_tokenizer():
TokenizerRegistry.register(
"test_tokenizer",
__name__,
TestTokenizer.__name__,
)
# NOTE(review): the multi-line call above and the one-line call below pass
# identical arguments; presumably they are the removed and added sides of
# one diff hunk whose +/- markers were lost - confirm only one is kept.
TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)
# Path 1: look the tokenizer up directly in the registry.
tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
assert isinstance(tokenizer, TestTokenizer)
assert tokenizer.bos_token_id == 0
assert tokenizer.eos_token_id == 1
assert tokenizer.pad_token_id == 2
# Path 2: resolve the same name through get_tokenizer in "custom" mode.
tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
assert isinstance(tokenizer, TestTokenizer)
assert tokenizer.bos_token_id == 0
assert tokenizer.eos_token_id == 1
assert tokenizer.pad_token_id == 2