[Refactor] Reduce duplicate code in serving_chat (#26627)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in:
Chauncey
2025-10-11 20:04:49 +08:00
committed by GitHub
parent f7ee69868a
commit d0bed837ac
3 changed files with 56 additions and 49 deletions

View File

@@ -6,7 +6,7 @@ import json
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import Callable, Final, Optional, Union
from typing import Final, Optional, Union
import jinja2
import partial_json_parser
@@ -56,14 +56,13 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
from vllm.entrypoints.utils import get_max_tokens
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.transformers_utils.tokenizers import (
@@ -112,42 +111,15 @@ class OpenAIServingChat(OpenAIServing):
self.trust_request_chat_template = trust_request_chat_template
self.enable_log_outputs = enable_log_outputs
# set up reasoning parser
self.reasoning_parser = self._get_reasoning_parser(
reasoning_parser_name=reasoning_parser
)
# set up tool use
self.enable_auto_tools: bool = enable_auto_tools
if self.enable_auto_tools:
logger.info(
'"auto" tool choice has been enabled please note that while'
" the parallel_tool_calls client option is preset for "
"compatibility reasons, it will be ignored."
)
self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = (
None
self.tool_parser = self._get_tool_parser(
tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
)
if reasoning_parser:
try:
self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
reasoning_parser
)
assert self.reasoning_parser is not None
except Exception as e:
raise TypeError(f"{reasoning_parser=} has not been registered") from e
self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
if self.enable_auto_tools:
try:
if tool_parser == "pythonic" and self.model_config.model.startswith(
"meta-llama/Llama-3.2"
):
logger.warning(
"Llama3.2 models may struggle to emit valid pythonic tool calls"
)
self.tool_parser = ToolParserManager.get_tool_parser(tool_parser)
except Exception as e:
raise TypeError(
"Error: --enable-auto-tool-choice requires "
f"tool_parser:'{tool_parser}' which has not "
"been registered"
) from e
self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
self.enable_prompt_tokens_details = enable_prompt_tokens_details