[Misc] Make cached tokenizer pickle-compatible (#17048)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-27 13:05:00 +08:00
parent 8e4b351a0c
commit 93a126fbc7
5 changed files with 81 additions and 57 deletions
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -257,7 +257,7 @@ class MistralTokenizer(TokenizerBase):
    # the following attributes are set to fit vLLM's design and are used
    # by the guided structured output backends.
    @property
-    def all_special_tokens_extended(self) -> List[str]:
+    def all_special_tokens_extended(self) -> list[str]:
        from mistral_common.tokens.tokenizers.base import SpecialTokens

        # tekken defines its own extended special tokens list
@@ -271,11 +271,11 @@ class MistralTokenizer(TokenizerBase):
        ]

    @property
-    def all_special_tokens(self) -> List[str]:
+    def all_special_tokens(self) -> list[str]:
        return self.all_special_tokens_extended

    @property
-    def all_special_ids(self) -> List[int]:
+    def all_special_ids(self) -> list[int]:
        return [
            self.all_special_tokens.index(t) for t in self.all_special_tokens
        ]
@@ -335,12 +335,12 @@ class MistralTokenizer(TokenizerBase):
            input_ids = self.encode_one(text, truncation, max_length)
        return Encoding(input_ids=input_ids)

-    def get_vocab(self) -> Dict[str, int]:
+    def get_vocab(self) -> dict[str, int]:
        # NB: the dictionary form of the vocabulary collapses token ids that map
        # to the same string but have different bytes
        return self._vocab_dict

-    def get_added_vocab(self) -> Dict[str, int]:
+    def get_added_vocab(self) -> dict[str, int]:
        # Mistral tokenizers have no added vocabulary
        return {}