Add think chunk (#21333)

Signed-off-by: Julien Denize <julien.denize@mistral.ai>
Author: Julien Denize
Date: 2025-07-24 06:51:32 +02:00
Committed by: GitHub
Parent: 11ef7a611e
Commit: 6d8d0a24c0
11 changed files with 682 additions and 13 deletions


@@ -145,6 +145,21 @@ def find_tokenizer_file(files: list[str]):
     return matched_files[0]


+def _aggregate_content(content: list) -> list[dict[str, Any]]:
+    aggregated_content: list[dict[str, Any]] = []
+    for chunk in content:
+        if (chunk.get("type") == "text" and aggregated_content
+                and aggregated_content[-1].get("type") == "text"):
+            aggregated_content[-1]["text"] += "\n\n" + chunk.get("text")
+        else:
+            aggregated_content.append(chunk)
+    if (len(aggregated_content) == 1
+            and aggregated_content[0].get("type") == "text"):
+        content = aggregated_content[0]["text"]
+    return content
+
+
 def make_mistral_chat_completion_request(
     messages: list["ChatCompletionMessageParam"],
     tools: Optional[list[dict[str,
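
For context, the new `_aggregate_content` helper merges adjacent text chunks with a blank line between them and, when the whole list collapses to a single text chunk, returns a bare string (despite the `list[dict[str, Any]]` return annotation). A hypothetical input/output pair, not taken from the commit; the `"thinking"` chunk shape is an assumption:

    content = [
        {"type": "text", "text": "Step 1"},
        {"type": "text", "text": "Step 2"},
    ]
    # _aggregate_content(content) -> "Step 1\n\nStep 2"

    content = [
        {"type": "thinking", "thinking": "..."},  # assumed chunk shape
        {"type": "text", "text": "Answer"},
    ]
    # _aggregate_content(content) -> the same two-chunk list, unchanged
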
@@ -162,10 +177,10 @@ def make_mistral_chat_completion_request(
         # Convert list text content to string
         if message.get("role") in ("assistant", "tool"):
-            content = message.get("content")
+            content: Any = message.get("content")
             if isinstance(content, list):
-                content = "\n".join(chunk.get("text") for chunk in content)
-            message["content"] = content
+                content = _aggregate_content(content)
+            message["content"] = content

     # The Mistral client, in comparison to the OpenAI client, requires the
     # "parameters" dict to be present, even if it's empty.
@@ -465,6 +480,8 @@ class MistralTokenizer(TokenizerBase):
         skip_special_tokens: bool = True,
     ) -> list[str]:
         from mistral_common.tokens.tokenizers.base import SpecialTokens
+        from mistral_common.tokens.tokenizers.instruct import (
+            InstructTokenizerV13)

         # TODO(Patrick) - potentially allow special tokens to not be skipped
         assert (
@@ -474,10 +491,18 @@ class MistralTokenizer(TokenizerBase):
         assert self.is_tekken or self.is_spm, type(self.tokenizer)

         if self.is_tekken:
-            # skip special tokens except tool call
-            ids = [
-                i for i in ids if i > self.tokenizer.num_special_tokens or i ==
-                self.tokenizer.get_control_token(SpecialTokens.tool_calls)
-            ]
+            # skip special tokens except tool call and think tokens
+            non_skip_special_tokens = {
+                self.tokenizer.get_control_token(SpecialTokens.tool_calls)
+            }
+            if isinstance(self.instruct, InstructTokenizerV13):
+                if self.instruct.BEGIN_THINK:
+                    non_skip_special_tokens.add(self.instruct.BEGIN_THINK)
+                if self.instruct.END_THINK:
+                    non_skip_special_tokens.add(self.instruct.END_THINK)
+            ids = [
+                i for i in ids if i > self.tokenizer.num_special_tokens
+                or i in non_skip_special_tokens
+            ]

         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
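
To make the new filter concrete, here is a standalone sketch; the token IDs and the special-token budget below are invented, and the names only mirror the attributes used above:

    # Hypothetical values for illustration only.
    num_special_tokens = 1000        # IDs at or below this are control tokens
    TOOL_CALLS = 5                   # stand-in for get_control_token(SpecialTokens.tool_calls)
    BEGIN_THINK, END_THINK = 34, 35  # stand-ins for InstructTokenizerV13's think tokens

    non_skip_special_tokens = {TOOL_CALLS, BEGIN_THINK, END_THINK}

    ids = [34, 1234, 7, 1500, 35, 1600]
    kept = [
        i for i in ids
        if i > num_special_tokens or i in non_skip_special_tokens
    ]
    # kept == [34, 1234, 1500, 35, 1600] -- regular tokens plus the think
    # delimiters survive decoding; the ordinary special token 7 is dropped.
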