[Bugfix] Ensure special tokens are properly filtered out for guided structured output with MistralTokenizer (#10363)

Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
This commit is contained in:
Guillaume Calmettes
2024-11-15 15:50:40 +01:00
committed by GitHub
parent 3a763ba0c3
commit 691a3ec047
2 changed files with 17 additions and 6 deletions

View File

@@ -174,18 +174,29 @@ class MistralTokenizer:
revision=revision)
return tokenizer_file
# The following attributes are set to fit vLLM's design and are used
# by the guided structured output backends.
@property
def all_special_tokens_extended(self) -> List[str]:
    """Return the tokenizer's special tokens as plain strings.

    The list is taken from the wrapped tokenizer's ``SPECIAL_TOKENS``
    attribute when it has one, and from the members of the
    ``SpecialTokens`` enum otherwise.

    Returns:
        One entry per special token, in source order. ``SpecialTokens``
        members are replaced by their ``.value``; any other entry is
        passed through unchanged.
    """
    # tekken defines its own extended special tokens list
    if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
        special_tokens = self.tokenizer.SPECIAL_TOKENS
    else:
        special_tokens = list(SpecialTokens)
    # The source list may mix enum members with other entries, so only
    # the enum members are unwrapped.
    return [
        s.value if isinstance(s, SpecialTokens) else s
        for s in special_tokens
    ]
@property
def all_special_tokens(self) -> List[str]:
    """Return the special tokens as strings.

    This is the same list that ``all_special_tokens_extended`` returns;
    no separate, shorter list is computed here.
    """
    return self.all_special_tokens_extended
@property
def all_special_ids(self) -> List[int]:
    """Return one integer id per entry of ``all_special_tokens``.

    Each id is the position of the token's first occurrence in
    ``all_special_tokens``, so a repeated token maps to the same id
    every time it appears.

    NOTE(review): this presumably relies on the special tokens occupying
    the leading ids of the vocabulary, in list order - confirm against
    the underlying tokenizer.
    """
    # Read the property once: every access rebuilds the token list.
    tokens = self.all_special_tokens
    return [tokens.index(t) for t in tokens]
@property
def bos_token_id(self) -> int: