- reference/vllm/tokenizers/ — official DSV4 tokenizer + encoding (read-only) - reference/vllm/reasoning/ — thinking mode parsers (DeepSeek-R1 style) - reference/vllm/tool_parsers/ — DSML tool call parsers (V3.2 base, V4 variant) - reference/official_inference/ — the original weights' generate.py, model.py, kernel.py - reference/README.md documents the layout and which files matter for our pipeline. - These are read-only references for cross-checking; they are not imported by production code.
97 lines
3.4 KiB
Python
97 lines
3.4 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
import copy
|
|
from typing import Any
|
|
|
|
from transformers import PreTrainedTokenizerFast
|
|
|
|
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
|
|
|
from .deepseek_v4_encoding import encode_messages
|
|
from .hf import HfTokenizer, get_cached_tokenizer
|
|
from .protocol import TokenizerLike
|
|
|
|
|
|
def get_deepseek_v4_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
    """
    Return a shallow copy of ``tokenizer`` that uses the custom DeepSeek V4
    chat template encoding.

    The copy is re-classed to a dynamically created subclass of the
    tokenizer's own class. That subclass overrides ``apply_chat_template``
    to build the prompt with ``encode_messages`` and serves ``__len__`` and
    ``get_added_vocab`` from values captured once at wrap time.
    """
    base_cls = tokenizer.__class__
    wrapped = copy.copy(tokenizer)

    # Captured once here; the subclass methods below read these snapshots
    # instead of querying the tokenizer on every call.
    added_vocab = tokenizer.get_added_vocab()
    total_vocab_size = tokenizer.vocab_size + len(added_vocab)

    class _DeepseekV4Tokenizer(base_cls):  # type: ignore
        def apply_chat_template(
            self,
            messages: list["ChatCompletionMessageParam"],
            tools: list[dict[str, Any]] | None = None,
            **kwargs,
        ) -> str | list[int]:
            # Either ``thinking`` or ``enable_thinking`` switches thinking on.
            wants_thinking = kwargs.get("thinking", False) or kwargs.get(
                "enable_thinking", False
            )
            thinking_mode = "thinking" if wants_thinking else "chat"

            # A ``conversation`` kwarg, when present, takes precedence over
            # ``messages``. Work on a shallow copy so the caller's list is
            # not modified by the insert below.
            messages = kwargs.get("conversation", messages).copy()
            if tools is not None and len(tools) > 0:
                # Tools are carried by an extra system message at the front.
                messages.insert(
                    0,
                    {"role": "system", "tools": tools},  # type: ignore[typeddict-unknown-key]
                )

            # Normalise ``reasoning_effort``:
            #   non-string        -> None
            #   "none"            -> None, and forces chat mode
            #   "max" / "xhigh"   -> "max"
            #   any other string  -> "high"
            effort = kwargs.get("reasoning_effort")
            if isinstance(effort, str):
                if effort == "none":
                    thinking_mode = "chat"
                    effort = None
                else:
                    effort = "max" if effort in ("max", "xhigh") else "high"
            else:
                effort = None

            prompt = encode_messages(  # type: ignore
                messages,
                thinking_mode=thinking_mode,
                drop_thinking=kwargs.get("drop_thinking", True),
                reasoning_effort=effort,
            )

            # ``tokenize`` defaults to True; when disabled, hand back the text.
            if not kwargs.get("tokenize", True):
                return prompt

            # Only these two options are forwarded to ``encode``.
            forwarded = {
                key: kwargs[key]
                for key in ("truncation", "max_length")
                if key in kwargs
            }
            return self.encode(prompt, add_special_tokens=False, **forwarded)

        def num_special_tokens_to_add(self) -> int:
            # Measured as the length of the encoding of the empty string.
            return len(self.encode(""))

        def __len__(self) -> int:
            # ``vocab_size`` plus the number of added tokens, as captured
            # when the wrapper was created.
            return total_vocab_size

        def get_added_vocab(self) -> dict[str, int]:
            # Return a copy so callers cannot alter the captured mapping.
            return added_vocab.copy()

        def __reduce__(self):
            # Pickle as "re-wrap the original tokenizer" rather than trying
            # to serialise this locally defined class.
            return (get_deepseek_v4_tokenizer, (tokenizer,))

    _DeepseekV4Tokenizer.__name__ = f"DSV4{base_cls.__name__}"

    wrapped.__class__ = _DeepseekV4Tokenizer
    return wrapped
|
|
|
|
|
|
class DeepseekV4Tokenizer(TokenizerLike):
    """Loader that produces a DeepSeek V4 wrapped, cached tokenizer."""

    @classmethod
    def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
        """
        Load a fast tokenizer and wrap it for DeepSeek V4.

        All positional and keyword arguments are forwarded unchanged to
        ``PreTrainedTokenizerFast.from_pretrained``. The loaded tokenizer is
        passed through ``get_deepseek_v4_tokenizer`` and then
        ``get_cached_tokenizer``, whose result is returned.
        """
        base_tokenizer = PreTrainedTokenizerFast.from_pretrained(*args, **kwargs)
        dsv4_tokenizer = get_deepseek_v4_tokenizer(base_tokenizer)
        return get_cached_tokenizer(dsv4_tokenizer)
|