[VLM] Avoid unnecessary tokenization (#12310)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -67,9 +67,10 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
     tokenizer_all_special_tokens_extended = (
         tokenizer.all_special_tokens_extended)
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
+    tokenizer_vocab = tokenizer.get_vocab()
     tokenizer_len = len(tokenizer)
 
-    max_token_id = max(tokenizer.get_vocab().values())
+    max_token_id = max(tokenizer_vocab.values())
     # Some tokenizers (e.g., QwenTokenizer) have special tokens that
     # are added and included in the implementation of the vocab_size
     # property, but not in get_vocab(); if there is an implementation
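The point of this hunk is to call get_vocab() exactly once: Hugging Face tokenizers typically rebuild the full token-to-id dict on every get_vocab() call, so max_token_id is now computed from the cached tokenizer_vocab instead of a fresh call. A rough way to observe the cost (a sketch, assuming transformers is installed; the model name and timing loop are only illustrative):

# Illustrative micro-benchmark: get_vocab() is not free on HF tokenizers,
# so computing max_token_id from a cached dict saves a full vocab rebuild.
import timeit

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # example model only

per_call = timeit.timeit(tok.get_vocab, number=100) / 100
print(f"get_vocab() takes ~{per_call * 1e3:.2f} ms per call")

tokenizer_vocab = tok.get_vocab()             # cache once, as in the diff
max_token_id = max(tokenizer_vocab.values())  # then reuse the cached dict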
@@ -96,6 +97,9 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
         def max_token_id(self):
            return max_token_id
 
+        def get_vocab(self):
+            return tokenizer_vocab
+
        def __len__(self):
            return tokenizer_len
 
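For context, this second hunk extends the pattern get_cached_tokenizer already uses: expensive values are computed once up front, and a dynamically created subclass shadows the slow methods with closures over those cached values. A minimal self-contained sketch of that pattern (DummyTokenizer and the final assertion are illustrative stand-ins, not vLLM code; only the shadowed methods mirror the diff above):

# Minimal sketch of the caching pattern in get_cached_tokenizer.
# DummyTokenizer stands in for a Hugging Face tokenizer whose
# get_vocab() rebuilds its dict on every call (the slow path).


class DummyTokenizer:
    def get_vocab(self):
        return {"<pad>": 0, "hello": 1, "world": 2}

    def __len__(self):
        return len(self.get_vocab())


def get_cached_tokenizer(tokenizer):
    # Compute the expensive values once, up front.
    tokenizer_vocab = tokenizer.get_vocab()
    tokenizer_len = len(tokenizer)
    max_token_id = max(tokenizer_vocab.values())

    class CachedTokenizer(tokenizer.__class__):
        @property
        def max_token_id(self):
            return max_token_id

        def get_vocab(self):
            return tokenizer_vocab

        def __len__(self):
            return tokenizer_len

    # Patch the object in place so existing references benefit too.
    tokenizer.__class__ = CachedTokenizer
    return tokenizer


tok = get_cached_tokenizer(DummyTokenizer())
assert tok.get_vocab() is tok.get_vocab()  # same cached dict, no recompute

Shadowing via a subclass (rather than, say, functools.lru_cache on each method) keeps the wrapped object fully substitutable for the original tokenizer while making every subsequent get_vocab() and len() call a constant-time closure lookup.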