diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py index 1c1dd3338..cf0d8f53c 100644 --- a/tests/tokenizers_/test_basic.py +++ b/tests/tokenizers_/test_basic.py @@ -29,7 +29,8 @@ def test_tokenizer_like_protocol(): _assert_tokenizer_like(tokenizer) tokenizer = get_tokenizer( - "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral" + "mistralai/Mistral-7B-Instruct-v0.3", + tokenizer_mode="mistral", ) assert isinstance(tokenizer, MistralTokenizer) _assert_tokenizer_like(tokenizer) @@ -40,11 +41,20 @@ def test_tokenizer_like_protocol(): tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32") assert isinstance(tokenizer, HfTokenizer) + # Verify it's a fast tokenizer (required for FastIncrementalDetokenizer) assert isinstance(tokenizer, PreTrainedTokenizerFast) assert "DSV32" in tokenizer.__class__.__name__ _assert_tokenizer_like(tokenizer) + tokenizer = get_tokenizer( + "Qwen/Qwen-VL", + tokenizer_mode="qwen_vl", + trust_remote_code=True, + ) + assert isinstance(tokenizer, HfTokenizer) + assert "WithoutImagePad" in tokenizer.__class__.__name__ + @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"]) def test_tokenizer_revision(tokenizer_name: str): diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index f8bf52de0..7c9a95ef1 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1321,6 +1321,7 @@ def add_cli_args(parser: argparse.ArgumentParser): - "slow" will always use the slow tokenizer.\n - "mistral" will always use the tokenizer from `mistral_common`.\n - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n + - "qwen_vl" will always use the tokenizer from `qwen_vl`.\n - Other custom values can be supported via plugins.""", ) parser.add_argument("--use-beam-search", action="store_true") diff --git a/vllm/config/model.py b/vllm/config/model.py index 4e3568fa1..6c48bfde6 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -126,6 +126,7 @@ class ModelConfig: - "slow" will always use the slow tokenizer.\n - "mistral" will always use the tokenizer from `mistral_common`.\n - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n + - "qwen_vl" will always use the tokenizer from `qwen_vl`.\n - Other custom values can be supported via plugins.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 8ac541f73..1eb8ecc2d 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -6,11 +6,9 @@ # Copyright (c) Alibaba Cloud. """Inference-only Qwen-VL model compatible with HuggingFace weights.""" -import copy import math -import unicodedata -from collections.abc import Callable, Collection, Mapping, Sequence, Set -from functools import lru_cache, partial +from collections.abc import Callable, Mapping, Sequence +from functools import partial from typing import Annotated, Literal, TypeAlias import regex as re @@ -436,60 +434,6 @@ class QwenVLModel(QWenModel): ) -@lru_cache(maxsize=1) -def _get_tokenizer_without_image_pad( - tokenizer: PreTrainedTokenizer, -) -> PreTrainedTokenizer: - """ - The logic of adding image pad tokens should only be applied in - [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor], - so they are patched out here. - - The definition of the wrapped tokenizer can be found here: - https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py - """ - new_tokenizer = copy.deepcopy(tokenizer) - - class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore - def tokenize( - self, - text: str, - allowed_special: Set[str] | str = "all", - disallowed_special: Collection[str] | str = (), - **kwargs, - ) -> list[bytes | str]: - text = unicodedata.normalize("NFC", text) - - return [ - self.decoder[t] - for t in self.tokenizer.encode( - text, - allowed_special=allowed_special, - disallowed_special=disallowed_special, - ) - ] - - def _decode( - self, - token_ids: int | list[int], - skip_special_tokens: bool = False, - errors: str | None = None, - **kwargs, - ) -> str: - if isinstance(token_ids, int): - token_ids = [token_ids] - - return self.tokenizer.decode( - token_ids, - errors=errors or self.errors, - ) - - TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad" - - new_tokenizer.__class__ = TokenizerWithoutImagePad - return new_tokenizer - - class QwenVLProcessor: """ This model doesn't define its own HF processor, @@ -574,12 +518,6 @@ class QwenVLProcessor: class QwenVLProcessingInfo(BaseProcessingInfo): - def get_tokenizer(self) -> PreTrainedTokenizer: - tokenizer = self.ctx.get_tokenizer() - assert isinstance(tokenizer, PreTrainedTokenizer) - - return _get_tokenizer_without_image_pad(tokenizer) - def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor: return self.ctx.init_processor( QwenVLProcessor, diff --git a/vllm/renderers/qwen_vl.py b/vllm/renderers/qwen_vl.py new file mode 100644 index 000000000..4b47d0216 --- /dev/null +++ b/vllm/renderers/qwen_vl.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +from vllm.config import VllmConfig +from vllm.tokenizers import cached_get_tokenizer +from vllm.tokenizers.qwen_vl import QwenVLTokenizer + +from .base import BaseRenderer +from .hf import HfRenderer + + +class QwenVLRenderer(BaseRenderer[QwenVLTokenizer]): + @classmethod + def from_config( # type: ignore[override] + cls, + config: VllmConfig, + tokenizer_kwargs: dict[str, Any], + ) -> "HfRenderer": + model_config = config.model_config + if model_config.skip_tokenizer_init: + tokenizer = None + else: + tokenizer = cached_get_tokenizer( + tokenizer_cls=QwenVLTokenizer, + **tokenizer_kwargs, + ) + + return HfRenderer(config, tokenizer) diff --git a/vllm/renderers/registry.py b/vllm/renderers/registry.py index cd09c80f9..de95505ec 100644 --- a/vllm/renderers/registry.py +++ b/vllm/renderers/registry.py @@ -20,6 +20,7 @@ _VLLM_RENDERERS = { "hf": ("hf", "HfRenderer"), "grok2": ("grok2", "Grok2Renderer"), "mistral": ("mistral", "MistralRenderer"), + "qwen_vl": ("qwen_vl", "QwenVLRenderer"), "terratorch": ("terratorch", "TerratorchRenderer"), } diff --git a/vllm/tokenizers/deepseek_v32.py b/vllm/tokenizers/deepseek_v32.py index 28071ef69..4525eaa34 100644 --- a/vllm/tokenizers/deepseek_v32.py +++ b/vllm/tokenizers/deepseek_v32.py @@ -7,9 +7,9 @@ from transformers import AutoTokenizer from vllm.entrypoints.chat_utils import ChatCompletionMessageParam -from . import TokenizerLike from .deepseek_v32_encoding import encode_messages from .hf import HfTokenizer, get_cached_tokenizer +from .protocol import TokenizerLike def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer: diff --git a/vllm/tokenizers/qwen_vl.py b/vllm/tokenizers/qwen_vl.py new file mode 100644 index 000000000..5b506df4d --- /dev/null +++ b/vllm/tokenizers/qwen_vl.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy +import unicodedata +from collections.abc import Collection, Set + +from transformers import AutoTokenizer + +from .hf import HfTokenizer, get_cached_tokenizer +from .protocol import TokenizerLike + + +def get_qwen_vl_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer: + """ + The logic of adding image pad tokens should only be applied in + `QwenVLProcessor`, so they are patched out here. + + The definition of the wrapped tokenizer can be found here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py + """ + new_tokenizer = copy.copy(tokenizer) + + class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore + def tokenize( + self, + text: str, + allowed_special: Set[str] | str = "all", + disallowed_special: Collection[str] | str = (), + **kwargs, + ) -> list[bytes | str]: + text = unicodedata.normalize("NFC", text) + + return [ + self.decoder[t] + for t in self.tokenizer.encode( + text, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ] + + def _decode( + self, + token_ids: int | list[int], + skip_special_tokens: bool = False, + errors: str | None = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + + return self.tokenizer.decode( + token_ids, + errors=errors or self.errors, + ) + + TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad" + + new_tokenizer.__class__ = TokenizerWithoutImagePad + return new_tokenizer + + +class QwenVLTokenizer(TokenizerLike): + @classmethod + def from_pretrained(cls, *args, **kwargs) -> HfTokenizer: + tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs) + return get_cached_tokenizer(get_qwen_vl_tokenizer(tokenizer)) diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index 2da7842b0..4512f766c 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -36,6 +36,7 @@ _VLLM_TOKENIZERS = { "grok2": ("grok2", "Grok2Tokenizer"), "hf": ("hf", "CachedHfTokenizer"), "mistral": ("mistral", "MistralTokenizer"), + "qwen_vl": ("qwen_vl", "QwenVLTokenizer"), } @@ -165,6 +166,10 @@ def resolve_tokenizer_args( ): tokenizer_mode = "grok2" + # Model-specific tokenizers + if tokenizer_mode == "auto" and "/Qwen-VL" in str(tokenizer_name): + tokenizer_mode = "qwen_vl" + # Fallback to HF tokenizer if tokenizer_mode == "auto": tokenizer_mode = "hf"