[Refactor] Reduce duplicate code in serving_chat (#26627)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in:
@@ -6,7 +6,7 @@ import json
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, AsyncIterator
|
||||
from collections.abc import Sequence as GenericSequence
|
||||
from typing import Callable, Final, Optional, Union
|
||||
from typing import Final, Optional, Union
|
||||
|
||||
import jinja2
|
||||
import partial_json_parser
|
||||
@@ -56,14 +56,13 @@ from vllm.entrypoints.openai.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParser
|
||||
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
|
||||
from vllm.entrypoints.utils import get_max_tokens
|
||||
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||
from vllm.sampling_params import BeamSearchParams, SamplingParams
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||
from vllm.transformers_utils.tokenizers import (
|
||||
@@ -112,42 +111,15 @@ class OpenAIServingChat(OpenAIServing):
|
||||
self.trust_request_chat_template = trust_request_chat_template
|
||||
self.enable_log_outputs = enable_log_outputs
|
||||
|
||||
# set up reasoning parser
|
||||
self.reasoning_parser = self._get_reasoning_parser(
|
||||
reasoning_parser_name=reasoning_parser
|
||||
)
|
||||
# set up tool use
|
||||
self.enable_auto_tools: bool = enable_auto_tools
|
||||
if self.enable_auto_tools:
|
||||
logger.info(
|
||||
'"auto" tool choice has been enabled please note that while'
|
||||
" the parallel_tool_calls client option is preset for "
|
||||
"compatibility reasons, it will be ignored."
|
||||
)
|
||||
|
||||
self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = (
|
||||
None
|
||||
self.tool_parser = self._get_tool_parser(
|
||||
tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
|
||||
)
|
||||
if reasoning_parser:
|
||||
try:
|
||||
self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
|
||||
reasoning_parser
|
||||
)
|
||||
assert self.reasoning_parser is not None
|
||||
except Exception as e:
|
||||
raise TypeError(f"{reasoning_parser=} has not been registered") from e
|
||||
self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
|
||||
if self.enable_auto_tools:
|
||||
try:
|
||||
if tool_parser == "pythonic" and self.model_config.model.startswith(
|
||||
"meta-llama/Llama-3.2"
|
||||
):
|
||||
logger.warning(
|
||||
"Llama3.2 models may struggle to emit valid pythonic tool calls"
|
||||
)
|
||||
self.tool_parser = ToolParserManager.get_tool_parser(tool_parser)
|
||||
except Exception as e:
|
||||
raise TypeError(
|
||||
"Error: --enable-auto-tool-choice requires "
|
||||
f"tool_parser:'{tool_parser}' which has not "
|
||||
"been registered"
|
||||
) from e
|
||||
self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
|
||||
|
||||
self.enable_prompt_tokens_details = enable_prompt_tokens_details
|
||||
|
||||
Reference in New Issue
Block a user