[Bugfix] Fix Qwen-VL tokenizer implementation (#36140)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-03-06 00:07:19 +08:00
committed by GitHub
parent 176c799f4c
commit 7196348157
9 changed files with 118 additions and 66 deletions

View File

@@ -6,11 +6,9 @@
# Copyright (c) Alibaba Cloud.
"""Inference-only Qwen-VL model compatible with HuggingFace weights."""
import copy
import math
import unicodedata
from collections.abc import Callable, Collection, Mapping, Sequence, Set
from functools import lru_cache, partial
from collections.abc import Callable, Mapping, Sequence
from functools import partial
from typing import Annotated, Literal, TypeAlias
import regex as re
@@ -436,60 +434,6 @@ class QwenVLModel(QWenModel):
)
@lru_cache(maxsize=1)
def _get_tokenizer_without_image_pad(
    tokenizer: PreTrainedTokenizer,
) -> PreTrainedTokenizer:
    """
    Return a deep copy of ``tokenizer`` whose class is swapped for a
    subclass that skips the image-pad-token insertion behavior.

    The logic of adding image pad tokens should only be applied in
    [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
    so they are patched out here.

    The definition of the wrapped tokenizer can be found here:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
    """

    class TokenizerWithoutImagePad(tokenizer.__class__):  # type: ignore
        def tokenize(
            self,
            text: str,
            allowed_special: Set[str] | str = "all",
            disallowed_special: Collection[str] | str = (),
            **kwargs,
        ) -> list[bytes | str]:
            # Encode directly with the inner tiktoken-style tokenizer so the
            # image-pad post-processing of the parent class never runs.
            normalized = unicodedata.normalize("NFC", text)
            token_ids = self.tokenizer.encode(
                normalized,
                allowed_special=allowed_special,
                disallowed_special=disallowed_special,
            )
            return [self.decoder[token_id] for token_id in token_ids]

        def _decode(
            self,
            token_ids: int | list[int],
            skip_special_tokens: bool = False,
            errors: str | None = None,
            **kwargs,
        ) -> str:
            # Accept a bare id for convenience, mirroring the HF interface.
            ids = [token_ids] if isinstance(token_ids, int) else token_ids
            return self.tokenizer.decode(
                ids,
                errors=errors or self.errors,
            )

    # Keep the user-visible class name informative even though the class
    # object itself is freshly created per call.
    TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"

    patched_tokenizer = copy.deepcopy(tokenizer)
    patched_tokenizer.__class__ = TokenizerWithoutImagePad
    return patched_tokenizer
class QwenVLProcessor:
"""
This model doesn't define its own HF processor,
@@ -574,12 +518,6 @@ class QwenVLProcessor:
class QwenVLProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self) -> PreTrainedTokenizer:
    """Return the HF tokenizer with image-pad insertion patched out."""
    base_tokenizer = self.ctx.get_tokenizer()
    assert isinstance(base_tokenizer, PreTrainedTokenizer)
    return _get_tokenizer_without_image_pad(base_tokenizer)
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
return self.ctx.init_processor(
QwenVLProcessor,