Remove all_special_tokens_extended from tokenizer code (#29686)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-11-28 20:26:51 +00:00
committed by GitHub
parent 8d9338fae4
commit fecae12cd7
6 changed files with 40 additions and 65 deletions

View File

@@ -258,52 +258,46 @@ def mistral_tokenizer(request) -> MistralTokenizer:
)
class TestMistralTokenizer:
def test_all_special_tokens(self, mistral_tokenizer: MistralTokenizer):
    """Check that ``all_special_tokens`` returns the expected tokens in order.

    The expected list depends on ``mistral_tokenizer.is_tekken``; each branch
    below spells out the full ordered list for that case.

    Only ``all_special_tokens`` is checked. The earlier loop that also read
    ``all_special_tokens_extended`` is gone because that attribute has been
    removed from the tokenizer code.
    """
    if mistral_tokenizer.is_tekken:
        # Named tokens with two runs of "<SPECIAL_i>" placeholders between
        # and after them (indices 20..31 and 36..999).
        expected = (
            [
                "<unk>",
                "<s>",
                "</s>",
                "[INST]",
                "[/INST]",
                "[AVAILABLE_TOOLS]",
                "[/AVAILABLE_TOOLS]",
                "[TOOL_RESULTS]",
                "[/TOOL_RESULTS]",
                "[TOOL_CALLS]",
                "[IMG]",
                "<pad>",
                "[IMG_BREAK]",
                "[IMG_END]",
                "[PREFIX]",
                "[MIDDLE]",
                "[SUFFIX]",
                "[SYSTEM_PROMPT]",
                "[/SYSTEM_PROMPT]",
                "[TOOL_CONTENT]",
            ]
            + [f"<SPECIAL_{i}>" for i in range(20, 32)]
            + [
                "[ARGS]",
                "[CALL_ID]",
                "[THINK]",
                "[/THINK]",
            ]
            + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
        )
    else:
        # Named tokens followed by "[control_i]" placeholders (8..768).
        expected = [
            "<s>",
            "</s>",
            "[INST]",
            "[/INST]",
            "[TOOL_CALLS]",
            "[AVAILABLE_TOOLS]",
            "[/AVAILABLE_TOOLS]",
            "[TOOL_RESULTS]",
            "[/TOOL_RESULTS]",
        ] + [f"[control_{i}]" for i in range(8, 769)]

    assert mistral_tokenizer.all_special_tokens == expected
def get_vocab(self, mistral_tokenizer: MistralTokenizer):
assert (