[Bugfix] Fix guided decoding with tokenizer mode mistral (#11046)

This commit is contained in:
Wallas Henrique
2024-12-18 03:34:08 -03:00
committed by GitHub
parent 866fa4550d
commit 8b79f9e107
7 changed files with 217 additions and 52 deletions

View File

@@ -132,7 +132,7 @@ def get_tokenizer(
if is_from_mistral_org and tokenizer_mode != "mistral":
warnings.warn(
'It is strongly recommended to run mistral models with '
'`--tokenizer_mode "mistral"` to ensure correct '
'`--tokenizer-mode "mistral"` to ensure correct '
'encoding and decoding.',
FutureWarning,
stacklevel=2)

View File

@@ -314,12 +314,15 @@ class MistralTokenizer:
if regular_tokens:
decoded_list.append(
self.decode(regular_tokens)) # type: ignore
self.tokenizer.decode(regular_tokens)) # type: ignore
decoded = ''.join(decoded_list)
return decoded
# WARN: Outlines logits processors can overwrite this method.
# See guided_decoding/outlines_logits_processors.py::_adapt_tokenizer
# for details.
def decode(self,
ids: Union[List[int], int],
skip_special_tokens: bool = True) -> str: