[Misc] Update TokenizerLike interface and move get_cached_tokenizer (#29730)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -7,7 +7,7 @@ import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.transformers_utils.tokenizer import get_cached_tokenizer
|
||||
from vllm.tokenizers.hf import get_cached_tokenizer
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"])
|
||||
@@ -356,8 +356,8 @@ class TestMistralTokenizer:
|
||||
)
|
||||
attn_mask = [1 for _ in range(len(token_ids))]
|
||||
|
||||
# Test 1: default
|
||||
assert mistral_tokenizer("Hello world !") == {
|
||||
# Test 1: no special tokens
|
||||
assert mistral_tokenizer("Hello world !", add_special_tokens=False) == {
|
||||
"attention_mask": attn_mask[1:],
|
||||
"input_ids": token_ids[1:],
|
||||
}
|
||||
@@ -381,7 +381,7 @@ class TestMistralTokenizer:
|
||||
"input_ids": token_ids,
|
||||
}
|
||||
# Test 5: empty string
|
||||
assert mistral_tokenizer("") == {
|
||||
assert mistral_tokenizer("", add_special_tokens=False) == {
|
||||
"attention_mask": [],
|
||||
"input_ids": [],
|
||||
}
|
||||
|
||||
@@ -17,20 +17,26 @@ class TestTokenizer(TokenizerLike):
|
||||
def eos_token_id(self) -> int:
|
||||
return 1
|
||||
|
||||
@property
|
||||
def pad_token_id(self) -> int:
|
||||
return 2
|
||||
|
||||
@property
|
||||
def is_fast(self) -> bool:
|
||||
return True
|
||||
|
||||
|
||||
def test_customized_tokenizer():
|
||||
TokenizerRegistry.register(
|
||||
"test_tokenizer",
|
||||
__name__,
|
||||
TestTokenizer.__name__,
|
||||
)
|
||||
TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)
|
||||
|
||||
tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
|
||||
assert isinstance(tokenizer, TestTokenizer)
|
||||
assert tokenizer.bos_token_id == 0
|
||||
assert tokenizer.eos_token_id == 1
|
||||
assert tokenizer.pad_token_id == 2
|
||||
|
||||
tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
|
||||
assert isinstance(tokenizer, TestTokenizer)
|
||||
assert tokenizer.bos_token_id == 0
|
||||
assert tokenizer.eos_token_id == 1
|
||||
assert tokenizer.pad_token_id == 2
|
||||
|
||||
Reference in New Issue
Block a user