Add think chunk (#21333)
Some checks failed
Create Release / Create Release (push) Has been cancelled

Signed-off-by: Julien Denize <julien.denize@mistral.ai>
This commit is contained in:
Julien Denize
2025-07-24 06:51:32 +02:00
committed by GitHub
parent 11ef7a611e
commit 6d8d0a24c0
11 changed files with 682 additions and 13 deletions

View File

@@ -6,6 +6,7 @@ from typing import Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage)
from vllm.reasoning import ReasoningParser
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
class StreamingReasoningReconstructor:
@@ -54,6 +55,32 @@ def run_reasoning_extraction(
return reasoning, content
def run_reasoning_extraction_mistral(
    reasoning_parser: ReasoningParser,
    model_output: list[int],
    request: Union[ChatCompletionRequest, None] = None,
    streaming: bool = False,
) -> tuple[Optional[str], Optional[str]]:
    """Extract reasoning and content from token ids using a Mistral tokenizer.

    Args:
        reasoning_parser: Parser whose ``model_tokenizer`` must be a
            ``MistralTokenizer`` (checked with an assertion).
        model_output: Token ids produced by the model.
        request: Optional chat completion request forwarded to the
            underlying extraction helper.
        streaming: When true, feed the ids through the streaming helper
            one at a time; otherwise convert all ids to token strings and
            run the non-streaming helper.

    Returns:
        A ``(reasoning, content)`` tuple. In streaming mode a falsy
        ``other_content`` on the reconstructor is reported as ``None``.
    """
    tokenizer = reasoning_parser.model_tokenizer
    assert isinstance(tokenizer, MistralTokenizer), type(tokenizer)

    if not streaming:
        # Non-streaming path: the parser helper works on token strings,
        # so convert the whole id sequence up front.
        token_strings = tokenizer.convert_ids_to_tokens(model_output)
        reasoning_text, content_text = run_reasoning_extraction_nonstreaming(
            reasoning_parser, token_strings, request)
        return reasoning_text, content_text

    # Streaming path: replay the ids one by one and read the accumulated
    # result off the reconstructor.
    rebuilt = run_reasoning_extraction_streaming_mistral(
        reasoning_parser,
        model_output,
        request,
    )
    return (
        rebuilt.reasoning_content,
        rebuilt.other_content or None,
    )
def run_reasoning_extraction_nonstreaming(
reasoning_parser: ReasoningParser,
model_output: list[str],
@@ -94,3 +121,35 @@ def run_reasoning_extraction_streaming(
previous_text = current_text
previous_tokens = current_tokens
return reconstructor
def run_reasoning_extraction_streaming_mistral(
    reasoning_parser: ReasoningParser,
    model_deltas: list[int],
    request: Union[ChatCompletionRequest, None] = None,
) -> StreamingReasoningReconstructor:
    """Replay token ids one at a time through the streaming reasoning parser.

    Each id in ``model_deltas`` is converted to its token string with the
    parser's Mistral tokenizer and passed, together with the accumulated
    text and ids, to ``extract_reasoning_content_streaming``. Every
    non-``None`` delta message is appended to the returned reconstructor.

    Args:
        reasoning_parser: Parser whose ``model_tokenizer`` must be a
            ``MistralTokenizer`` (checked with an assertion).
        model_deltas: Token ids, each treated as one streaming delta.
        request: Optional chat completion request; a default one is built
            when it is falsy.

    Returns:
        The ``StreamingReasoningReconstructor`` holding the accumulated
        deltas.
    """
    tokenizer = reasoning_parser.model_tokenizer
    assert isinstance(tokenizer, MistralTokenizer), type(tokenizer)

    # NOTE: `request` is defaulted here but is not referenced again within
    # this function.
    request = request or ChatCompletionRequest(messages=[], model="test-model")

    reconstructor = StreamingReasoningReconstructor()
    text_so_far = ""
    ids_so_far: list[int] = []

    for token_id in model_deltas:
        id_delta = [token_id]
        text_delta = tokenizer.convert_ids_to_tokens([token_id])[0]

        # Build fresh "current" values so the parser sees distinct
        # previous/current states for this step.
        next_text = text_so_far + text_delta
        next_ids = ids_so_far + id_delta

        message = reasoning_parser.extract_reasoning_content_streaming(
            text_so_far,
            next_text,
            text_delta,
            ids_so_far,
            next_ids,
            id_delta,
        )
        if message is not None:
            reconstructor.append_delta(message)

        text_so_far, ids_so_far = next_text, next_ids

    return reconstructor