[Bugfix] Fix edge-case crash when using chat with the Mistral Tekken Tokenizer (#10051)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
This commit is contained in:
@@ -254,7 +254,7 @@ class MistralTokenizer:
|
||||
skip_special_tokens: bool = True) -> str:
|
||||
assert (
|
||||
skip_special_tokens
|
||||
- ), "Skipping special tokens is not supported for Mistral tokenizers."
|
||||
+ ), "skip_special_tokens=False is not supported for Mistral tokenizers."
|
||||
|
||||
if isinstance(ids, int):
|
||||
ids = [ids]
|
||||
@@ -268,12 +268,16 @@ class MistralTokenizer:
|
||||
# TODO(Patrick) - potentially allow special tokens to not be skipped
|
||||
assert (
|
||||
skip_special_tokens
|
||||
- ), "Skipping special tokens is not supported for Mistral tokenizers."
|
||||
+ ), "skip_special_tokens=False is not supported for Mistral tokenizers."
|
||||
|
||||
assert isinstance(self.tokenizer,
|
||||
(Tekkenizer, SentencePieceTokenizer)), type(
|
||||
self.tokenizer)
|
||||
|
||||
+ if isinstance(self.tokenizer, Tekkenizer):
|
||||
+ # skip special tokens
|
||||
+ ids = [i for i in ids if i > self.tokenizer.num_special_tokens]
|
||||
|
||||
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
|
||||
|
||||
if any("<EFBFBD>" in t for t in tokens):
|
||||
|
||||
Reference in New Issue
Block a user