[Frontend] Skip stop in reasoning content (#14550)
Signed-off-by: Ce Gao <cegao@tensorchord.ai> Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
This commit is contained in:
@@ -40,6 +40,7 @@ from vllm.multimodal.cache import processor_only_cache_from_config
|
||||
from vllm.multimodal.processing import EncDecMultiModalProcessor
|
||||
from vllm.outputs import (PoolingRequestOutput, RequestOutput,
|
||||
RequestOutputFactory)
|
||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
|
||||
Sequence, SequenceGroup, SequenceGroupBase,
|
||||
@@ -372,6 +373,14 @@ class LLMEngine:
|
||||
"vllm.llm_engine",
|
||||
self.observability_config.otlp_traces_endpoint)
|
||||
|
||||
# Initialize reasoning parser if reasoning backend is set.
|
||||
if self.decoding_config.reasoning_backend and \
|
||||
self.tokenizer:
|
||||
reasoner_class = ReasoningParserManager.get_reasoning_parser(
|
||||
self.decoding_config.reasoning_backend)
|
||||
self.reasoner: ReasoningParser = reasoner_class(
|
||||
self.tokenizer.get_lora_tokenizer())
|
||||
|
||||
# Create sequence output processor, e.g. for beam search or
|
||||
# speculative decoding.
|
||||
self.output_processor = (
|
||||
@@ -381,8 +390,12 @@ class LLMEngine:
|
||||
self.scheduler,
|
||||
self.seq_counter,
|
||||
get_tokenizer_for_seq,
|
||||
stop_checker=StopChecker(self.scheduler_config.max_model_len,
|
||||
get_tokenizer_for_seq),
|
||||
stop_checker=StopChecker(
|
||||
self.scheduler_config.max_model_len,
|
||||
get_tokenizer_for_seq,
|
||||
self.reasoner if self.decoding_config.reasoning_backend
|
||||
and self.tokenizer else None,
|
||||
),
|
||||
))
|
||||
|
||||
self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}
|
||||
|
||||
Reference in New Issue
Block a user