Patch Mistral Tokenizer (#28146)

Signed-off-by: Julien Denize <julien.denize@mistral.ai>
This commit is contained in:
Julien Denize
2025-11-06 07:43:16 +01:00
committed by GitHub
parent e31946f86e
commit a404e2c0f1
2 changed files with 42 additions and 22 deletions

View File

@@ -334,20 +334,20 @@ class TestMistralTokenizer:
def test_encode(self, mistral_tokenizer: MistralTokenizer):
token_ids = (
[1, 22177, 4304, 2662, 2]
[1, 22177, 4304, 2662]
if mistral_tokenizer.is_tekken
else [1, 23325, 2294, 1686, 2]
else [1, 23325, 2294, 1686]
)
assert mistral_tokenizer.encode("Hello world !") == token_ids[:-1]
assert mistral_tokenizer.encode("Hello world !", max_length=3) == token_ids[:-2]
assert mistral_tokenizer.encode("Hello world !") == token_ids
assert mistral_tokenizer.encode("Hello world !", max_length=3) == token_ids[:-1]
assert (
mistral_tokenizer.encode("Hello world !", truncation=True, max_length=3)
== token_ids[:-2]
== token_ids[:-1]
)
assert (
mistral_tokenizer.encode("Hello world !", truncation=False, max_length=3)
== token_ids[:-1]
== token_ids
)
assert (
@@ -358,7 +358,7 @@ class TestMistralTokenizer:
mistral_tokenizer.encode(
"Hello world !", add_special_tokens=True, max_length=3
)
== token_ids[:-2]
== token_ids[:-1]
)
assert (
mistral_tokenizer.encode(
@@ -368,7 +368,7 @@ class TestMistralTokenizer:
)
assert (
mistral_tokenizer.encode("Hello world !", add_special_tokens=False)
== token_ids[1:-1]
== token_ids[1:]
)
@pytest.mark.parametrize(
@@ -1088,6 +1088,19 @@ class TestMistralTokenizer:
== expected_tokens[mistral_tokenizer.is_tekken]
)
def test_decode_int(
    self,
    mistral_tokenizer: MistralTokenizer,
):
    """Check that ``decode`` accepts a bare integer id rather than a list.

    With special tokens kept (``skip_special_tokens=False``), id ``1`` is
    expected to decode to the string ``"<s>"``.
    """
    single_id = 1
    decoded = mistral_tokenizer.decode(single_id, skip_special_tokens=False)
    assert decoded == "<s>"
def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer):
tokens = (
[