[Bugfix] The special_tokens in tokenizer should also be controlled by do_lower_case in encoder_config. (#20750)

Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi
2025-07-18 17:10:47 +08:00
committed by GitHub
parent ca4eb82bcb
commit 5895afd780
2 changed files with 32 additions and 0 deletions

View File

@@ -16,6 +16,8 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
from vllm import envs
from vllm.logger import init_logger
from vllm.transformers_utils.config import (
get_sentence_transformer_tokenizer_config)
from vllm.transformers_utils.tokenizers import MistralTokenizer
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import make_async
@@ -256,6 +258,18 @@ def get_tokenizer(
else:
raise e
# The tokenizer's special tokens must also honor the
# do_lower_case setting from encoder_config.
encoder_config = get_sentence_transformer_tokenizer_config(
tokenizer_name, revision)
if isinstance(encoder_config, dict) and encoder_config.get(
"do_lower_case", False):
special_tokens_map = {
k: v.lower()
for k, v in tokenizer.special_tokens_map.items()
}
tokenizer.add_special_tokens(special_tokens_map)
# NOTE: We can remove this once https://github.com/THUDM/ChatGLM3/issues/1324 is resolved.
if type(tokenizer).__name__ in ("ChatGLMTokenizer",
"ChatGLM4Tokenizer"):