[Misc] Update TokenizerLike interface and move get_cached_tokenizer (#29730)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
from vllm.logger import init_logger
|
||||
@@ -12,6 +12,7 @@ if TYPE_CHECKING:
|
||||
ChatCompletionRequest as MistralChatCompletionRequest,
|
||||
)
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
|
||||
from transformers import BatchEncoding
|
||||
from transformers.tokenization_mistral_common import (
|
||||
MistralCommonTokenizer as TransformersMistralTokenizer,
|
||||
)
|
||||
@@ -165,7 +166,35 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
|
||||
|
||||
|
||||
class MistralTokenizer(TokenizerLike):
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
path_or_repo_id: str | Path,
|
||||
*args,
|
||||
trust_remote_code: bool = False,
|
||||
revision: str | None = None,
|
||||
download_dir: str | None = None,
|
||||
**kwargs,
|
||||
) -> "MistralTokenizer":
|
||||
from mistral_common.protocol.instruct.validator import ValidationMode
|
||||
from transformers.tokenization_mistral_common import (
|
||||
MistralCommonTokenizer as TransformersMistralTokenizer,
|
||||
)
|
||||
|
||||
tokenizer = TransformersMistralTokenizer.from_pretrained(
|
||||
path_or_repo_id,
|
||||
*args,
|
||||
mode=ValidationMode.test,
|
||||
cache_dir=download_dir,
|
||||
revision="main" if revision is None else revision,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
return cls(tokenizer)
|
||||
|
||||
def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
|
||||
super().__init__()
|
||||
|
||||
from mistral_common.protocol.instruct.validator import ValidationMode
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import (
|
||||
SentencePieceTokenizer,
|
||||
@@ -211,22 +240,6 @@ class MistralTokenizer(TokenizerLike):
|
||||
self._vocab = self.tokenizer._vocab
|
||||
self._max_token_id = self.vocab_size - 1
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, path_or_repo_id: str, *, revision: str | None = None
|
||||
) -> "MistralTokenizer":
|
||||
from mistral_common.protocol.instruct.validator import ValidationMode
|
||||
from transformers.tokenization_mistral_common import (
|
||||
MistralCommonTokenizer as TransformersMistralTokenizer,
|
||||
)
|
||||
|
||||
str_revision = "main" if revision is None else revision
|
||||
return cls(
|
||||
TransformersMistralTokenizer.from_pretrained(
|
||||
path_or_repo_id, revision=str_revision, mode=ValidationMode.test
|
||||
)
|
||||
)
|
||||
|
||||
def _get_special_token_ids(self) -> list[int]:
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import (
|
||||
SentencePieceTokenizer,
|
||||
@@ -271,6 +284,10 @@ class MistralTokenizer(TokenizerLike):
|
||||
def eos_token_id(self) -> int:
|
||||
return self.tokenizer.eos_id
|
||||
|
||||
@property
|
||||
def pad_token_id(self) -> int:
|
||||
return self.tokenizer.pad_id
|
||||
|
||||
@property
|
||||
def is_fast(self) -> bool:
|
||||
return True
|
||||
@@ -298,12 +315,12 @@ class MistralTokenizer(TokenizerLike):
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: str | list[str] | list[int],
|
||||
text: str | list[str],
|
||||
text_pair: str | None = None,
|
||||
add_special_tokens: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
truncation: bool = False,
|
||||
max_length: int | None = None,
|
||||
):
|
||||
) -> "BatchEncoding":
|
||||
if text_pair is not None:
|
||||
raise ValueError(
|
||||
"`text_pair` is not supported by `MistralTokenizer.__call__`."
|
||||
@@ -342,13 +359,11 @@ class MistralTokenizer(TokenizerLike):
|
||||
text: str,
|
||||
truncation: bool | None = None,
|
||||
max_length: int | None = None,
|
||||
add_special_tokens: bool | None = None,
|
||||
add_special_tokens: bool = True,
|
||||
) -> list[int]:
|
||||
# TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
|
||||
# is in, directly call self.transformers_tokenizer.encode(...).
|
||||
encoded = self.tokenizer.encode(
|
||||
text, bos=add_special_tokens is not False, eos=False
|
||||
)
|
||||
encoded = self.tokenizer.encode(text, bos=add_special_tokens, eos=False)
|
||||
|
||||
if truncation is not False and max_length is not None:
|
||||
return encoded[:max_length]
|
||||
@@ -383,7 +398,7 @@ class MistralTokenizer(TokenizerLike):
|
||||
return_dict=False,
|
||||
)
|
||||
|
||||
def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
|
||||
def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
|
||||
# TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
|
||||
# is in, directly call self.transformers_tokenizer.decode(...).
|
||||
if isinstance(ids, int):
|
||||
@@ -455,7 +470,7 @@ class MistralTokenizer(TokenizerLike):
|
||||
def convert_ids_to_tokens(
|
||||
self,
|
||||
ids: list[int],
|
||||
skip_special_tokens: bool = True,
|
||||
skip_special_tokens: bool = False,
|
||||
) -> list[str]:
|
||||
from mistral_common.tokens.tokenizers.base import (
|
||||
SpecialTokenPolicy,
|
||||
|
||||
Reference in New Issue
Block a user