more speculative decoding fixes

This commit is contained in:
2026-04-14 05:06:30 +00:00
parent d4813de98f
commit d4568f1d80
3 changed files with 305 additions and 2 deletions

View File

@@ -57,6 +57,7 @@ from vllm.entrypoints.openai.engine.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
@@ -171,6 +172,20 @@ class KimiK2ToolParser(ToolParser):
"Successfully initialized %s", self.__class__.__name__
)
# ------------------------------------------------------------------
# Request adjustment
# ------------------------------------------------------------------
def adjust_request(
    self, request: ChatCompletionRequest | ResponsesRequest
) -> ChatCompletionRequest | ResponsesRequest:
    """Prepare *request* so tool-call marker tokens survive decoding.

    After applying the base-class adjustments, turn off
    ``skip_special_tokens`` whenever tools are supplied and tool use is
    not explicitly disabled, since the Kimi-K2 markers
    (<|tool_calls_section_begin|>, <|tool_call_begin|>, etc.) are
    special tokens that the parser must see in the generated text.

    Returns the (possibly mutated) request object.
    """
    request = super().adjust_request(request)
    tools_active = bool(request.tools) and request.tool_choice != "none"
    if tools_active:
        # Keep special tokens in the decoded output for the parser.
        request.skip_special_tokens = False
    return request
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------