[Frontend] Skip stop in reasoning content (#14550)

Signed-off-by: Ce Gao <cegao@tensorchord.ai>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
This commit is contained in:
Ce Gao
2025-09-15 14:04:55 +08:00
committed by GitHub
parent 3f3313981c
commit f4a948f33f
3 changed files with 256 additions and 4 deletions

View File

@@ -40,6 +40,7 @@ from vllm.multimodal.cache import processor_only_cache_from_config
from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.outputs import (PoolingRequestOutput, RequestOutput,
RequestOutputFactory)
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
Sequence, SequenceGroup, SequenceGroupBase,
@@ -372,6 +373,14 @@ class LLMEngine:
"vllm.llm_engine",
self.observability_config.otlp_traces_endpoint)
# Initialize reasoning parser if reasoning backend is set.
if self.decoding_config.reasoning_backend and \
self.tokenizer:
reasoner_class = ReasoningParserManager.get_reasoning_parser(
self.decoding_config.reasoning_backend)
self.reasoner: ReasoningParser = reasoner_class(
self.tokenizer.get_lora_tokenizer())
# Create sequence output processor, e.g. for beam search or
# speculative decoding.
self.output_processor = (
@@ -381,8 +390,12 @@ class LLMEngine:
self.scheduler,
self.seq_counter,
get_tokenizer_for_seq,
stop_checker=StopChecker(self.scheduler_config.max_model_len,
get_tokenizer_for_seq),
stop_checker=StopChecker(
self.scheduler_config.max_model_len,
get_tokenizer_for_seq,
self.reasoner if self.decoding_config.reasoning_backend
and self.tokenizer else None,
),
))
self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}