diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index 074039f9e..a5bb3dbcf 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -31,7 +31,6 @@ def _check_consistency(target: AnyTokenizer, expected: AnyTokenizer): # Cached attributes assert target.all_special_ids == expected.all_special_ids assert target.all_special_tokens == expected.all_special_tokens - assert target.all_special_tokens_extended == expected.all_special_tokens_extended assert target.get_vocab() == expected.get_vocab() assert len(target) == len(expected) diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py index c80b698ba..4cdfa9df9 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -258,52 +258,46 @@ def mistral_tokenizer(request) -> MistralTokenizer: ) class TestMistralTokenizer: def test_all_special_tokens(self, mistral_tokenizer: MistralTokenizer): - attributes = [ - mistral_tokenizer.all_special_tokens, - mistral_tokenizer.all_special_tokens_extended, - ] - - for attribute in attributes: - if mistral_tokenizer.is_tekken: - assert attribute == [ - "", - "", - "", - "[INST]", - "[/INST]", - "[AVAILABLE_TOOLS]", - "[/AVAILABLE_TOOLS]", - "[TOOL_RESULTS]", - "[/TOOL_RESULTS]", - "[TOOL_CALLS]", - "[IMG]", - "", - "[IMG_BREAK]", - "[IMG_END]", - "[PREFIX]", - "[MIDDLE]", - "[SUFFIX]", - "[SYSTEM_PROMPT]", - "[/SYSTEM_PROMPT]", - "[TOOL_CONTENT]", - ] + [f"" for i in range(20, 32)] + [ - "[ARGS]", - "[CALL_ID]", - "[THINK]", - "[/THINK]", - ] + [f"" for i in range(36, 1000)] - else: - assert attribute == [ - "", - "", - "[INST]", - "[/INST]", - "[TOOL_CALLS]", - "[AVAILABLE_TOOLS]", - "[/AVAILABLE_TOOLS]", - "[TOOL_RESULTS]", - "[/TOOL_RESULTS]", - ] + [f"[control_{i}]" for i in range(8, 769)] + if mistral_tokenizer.is_tekken: + assert mistral_tokenizer.all_special_tokens == [ + "", + "", + "", + "[INST]", + "[/INST]", + "[AVAILABLE_TOOLS]", + "[/AVAILABLE_TOOLS]", + "[TOOL_RESULTS]", + "[/TOOL_RESULTS]", + "[TOOL_CALLS]", + "[IMG]", + "", + "[IMG_BREAK]", + "[IMG_END]", + "[PREFIX]", + "[MIDDLE]", + "[SUFFIX]", + "[SYSTEM_PROMPT]", + "[/SYSTEM_PROMPT]", + "[TOOL_CONTENT]", + ] + [f"" for i in range(20, 32)] + [ + "[ARGS]", + "[CALL_ID]", + "[THINK]", + "[/THINK]", + ] + [f"" for i in range(36, 1000)] + else: + assert mistral_tokenizer.all_special_tokens == [ + "", + "", + "[INST]", + "[/INST]", + "[TOOL_CALLS]", + "[AVAILABLE_TOOLS]", + "[/AVAILABLE_TOOLS]", + "[TOOL_RESULTS]", + "[/TOOL_RESULTS]", + ] + [f"[control_{i}]" for i in range(8, 769)] def get_vocab(self, mistral_tokenizer: MistralTokenizer): assert ( diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py index d89737888..f13bb4333 100644 --- a/tests/tokenization/test_tokenizer_registry.py +++ b/tests/tokenization/test_tokenizer_registry.py @@ -15,10 +15,6 @@ class TestTokenizer(TokenizerBase): def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer": return TestTokenizer() - @property - def all_special_tokens_extended(self) -> list[str]: - raise NotImplementedError() - @property def all_special_tokens(self) -> list[str]: raise NotImplementedError() diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 9eb7fe379..be4325ab9 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -96,7 +96,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: tokenizer_all_special_ids = tokenizer.all_special_ids tokenizer_all_special_tokens = tokenizer.all_special_tokens - tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended tokenizer_vocab = tokenizer.get_vocab() tokenizer_len = len(tokenizer) @@ -118,10 +117,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: def all_special_tokens(self) -> list[str]: return tokenizer_all_special_tokens - @property - def all_special_tokens_extended(self) -> list[str]: - return tokenizer_all_special_tokens_extended - @property def max_token_id(self) -> int: return max_token_id diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index 7421eb534..52f221d1e 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -10,11 +10,6 @@ if TYPE_CHECKING: class TokenizerBase(ABC): - @property - @abstractmethod - def all_special_tokens_extended(self) -> list[str]: - raise NotImplementedError() - @property @abstractmethod def all_special_tokens(self) -> list[str]: diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index caff43c55..1954e2a81 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -254,10 +254,6 @@ class MistralTokenizer(TokenizerBase): # the following attributes are set to fit vLLM's design and are used # by the structured output backends. - @property - def all_special_tokens_extended(self) -> list[str]: - return self.all_special_tokens - @property def all_special_tokens(self) -> list[str]: return self._special_tokens