diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index cea117541..e9aa87a69 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -505,7 +505,7 @@ Here is a summary of a plugin file: # adjust request. e.g.: set skip special tokens # to False for tool call output. - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request(self, request: ChatCompletionRequest | ResponsesRequest) -> ChatCompletionRequest | ResponsesRequest: return request # implement the tool call parse for stream call diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py index 52f03447d..83b41bbda 100644 --- a/vllm/entrypoints/serve/render/serving.py +++ b/vllm/entrypoints/serve/render/serving.py @@ -546,7 +546,7 @@ class OpenAIServingRender: raise NotImplementedError(msg) tokenizer = renderer.get_tokenizer() request = tool_parser(tokenizer, request.tools).adjust_request( - request=request # type: ignore[arg-type] + request=request ) return conversation, [engine_input] diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py index dd9dc9423..8a12f6fe1 100644 --- a/vllm/parser/abstract_parser.py +++ b/vllm/parser/abstract_parser.py @@ -32,9 +32,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, FunctionDefinition, ) -from vllm.entrypoints.openai.responses.protocol import ( - ResponsesRequest, -) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.tokenizers import TokenizerLike @@ -229,7 +227,9 @@ class Parser: # ========== Tool Parser Methods ========== - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: """ Adjust the request parameters for tool calling. diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py index 3f578b1b6..9c05bb6b9 100644 --- a/vllm/tool_parsers/abstract_tool_parser.py +++ b/vllm/tool_parsers/abstract_tool_parser.py @@ -62,7 +62,9 @@ class ToolParser: # whereas all tokenizers have .get_vocab() return self.model_tokenizer.get_vocab() - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: """ Static method that used to adjust the request parameters. """ diff --git a/vllm/tool_parsers/deepseekv32_tool_parser.py b/vllm/tool_parsers/deepseekv32_tool_parser.py index a9772b753..f30231009 100644 --- a/vllm/tool_parsers/deepseekv32_tool_parser.py +++ b/vllm/tool_parsers/deepseekv32_tool_parser.py @@ -19,6 +19,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import ( @@ -78,7 +79,9 @@ class DeepSeekV32ToolParser(ToolParser): "vLLM Successfully import tool parser %s !", self.__class__.__name__ ) - def adjust_request(self, request): + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": # Ensure tool call tokens diff --git a/vllm/tool_parsers/functiongemma_tool_parser.py b/vllm/tool_parsers/functiongemma_tool_parser.py index dfd91d974..35c4c6b84 100644 --- a/vllm/tool_parsers/functiongemma_tool_parser.py +++ b/vllm/tool_parsers/functiongemma_tool_parser.py @@ -18,6 +18,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import Tool, ToolParser @@ -86,7 +87,9 @@ class FunctionGemmaToolParser(ToolParser): return arguments - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": request.skip_special_tokens = False diff --git a/vllm/tool_parsers/gigachat3_tool_parser.py b/vllm/tool_parsers/gigachat3_tool_parser.py index f470f6a5b..d48ff43ee 100644 --- a/vllm/tool_parsers/gigachat3_tool_parser.py +++ b/vllm/tool_parsers/gigachat3_tool_parser.py @@ -18,6 +18,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import Tool, ToolParser @@ -55,7 +56,9 @@ class GigaChat3ToolParser(ToolParser): self.end_content: bool = False self.streamed_args_for_tool: list[str] = [] - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": request.skip_special_tokens = False diff --git a/vllm/tool_parsers/glm4_moe_tool_parser.py b/vllm/tool_parsers/glm4_moe_tool_parser.py index fc718921d..d1778c565 100644 --- a/vllm/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/tool_parsers/glm4_moe_tool_parser.py @@ -30,6 +30,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import ( @@ -151,7 +152,9 @@ class Glm4MoeModelToolParser(ToolParser): logger.exception("Failed to determine if tools are enabled.") return False - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: """Adjust request parameters for tool call token handling.""" request = super().adjust_request(request) if request.tools and request.tool_choice != "none": diff --git a/vllm/tool_parsers/granite4_tool_parser.py b/vllm/tool_parsers/granite4_tool_parser.py index 3d58690f5..b11e9e6d5 100644 --- a/vllm/tool_parsers/granite4_tool_parser.py +++ b/vllm/tool_parsers/granite4_tool_parser.py @@ -19,6 +19,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import ( @@ -59,7 +60,9 @@ class Granite4ToolParser(ToolParser): self.start_regex = re.compile(self.tc_start) self.end_regex = re.compile(self.tc_end) - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": # do not skip special tokens because the tool_call tokens are diff --git a/vllm/tool_parsers/hermes_tool_parser.py b/vllm/tool_parsers/hermes_tool_parser.py index 4e54d75b4..469bc8363 100644 --- a/vllm/tool_parsers/hermes_tool_parser.py +++ b/vllm/tool_parsers/hermes_tool_parser.py @@ -18,6 +18,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import ( @@ -77,7 +78,9 @@ class Hermes2ProToolParser(ToolParser): # Streaming state: what has been sent to the client. self._sent_content_idx: int = 0 - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": # do not skip special tokens because the tool_call tokens are diff --git a/vllm/tool_parsers/internlm2_tool_parser.py b/vllm/tool_parsers/internlm2_tool_parser.py index fc7c44cff..6f38b851a 100644 --- a/vllm/tool_parsers/internlm2_tool_parser.py +++ b/vllm/tool_parsers/internlm2_tool_parser.py @@ -19,6 +19,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import ( @@ -35,7 +36,9 @@ class Internlm2ToolParser(ToolParser): super().__init__(tokenizer, tools) self.position = 0 - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": # do not skip special tokens because internlm use the special diff --git a/vllm/tool_parsers/jamba_tool_parser.py b/vllm/tool_parsers/jamba_tool_parser.py index 5a9af9910..dec3c88d9 100644 --- a/vllm/tool_parsers/jamba_tool_parser.py +++ b/vllm/tool_parsers/jamba_tool_parser.py @@ -20,6 +20,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import Tool, ToolParser @@ -68,7 +69,9 @@ class JambaToolParser(ToolParser): "tokens in the tokenizer!" ) - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": # do not skip special tokens because jamba use the special diff --git a/vllm/tool_parsers/mistral_tool_parser.py b/vllm/tool_parsers/mistral_tool_parser.py index 153c6ed32..dc92522a0 100644 --- a/vllm/tool_parsers/mistral_tool_parser.py +++ b/vllm/tool_parsers/mistral_tool_parser.py @@ -23,6 +23,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import ( @@ -111,7 +112,9 @@ class MistralToolParser(ToolParser): "the tokenizer!" ) - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if ( not is_mistral_tokenizer(self.model_tokenizer) diff --git a/vllm/tool_parsers/step3_tool_parser.py b/vllm/tool_parsers/step3_tool_parser.py index a9c569587..06e7a6466 100644 --- a/vllm/tool_parsers/step3_tool_parser.py +++ b/vllm/tool_parsers/step3_tool_parser.py @@ -19,6 +19,7 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import ( @@ -51,7 +52,9 @@ class Step3ToolParser(ToolParser): self.tool_block_started = False self.tool_block_finished = False - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": request.skip_special_tokens = False