Compare commits
5 Commits
v0.17.0rc0
...
v0.17.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b31e9326a7 | ||
|
|
e346c08560 | ||
|
|
b7a423cb01 | ||
|
|
fa78ec8a72 | ||
|
|
9a474ce7a4 |
@@ -44,7 +44,7 @@ docker run \
|
|||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
|
|||||||
@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
|
|||||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||||
echo "Wheels copied to local directory"
|
echo "Wheels copied to local directory"
|
||||||
# generate source tarball
|
# generate source distribution using setup.py
|
||||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
|
python setup.py sdist --dist-dir=$DIST_DIR
|
||||||
ls -la $DIST_DIR
|
ls -la $DIST_DIR
|
||||||
|
|
||||||
|
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
|
||||||
|
echo "Found sdist: $SDIST_FILE"
|
||||||
|
|
||||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||||
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
python3 -m twine check "$PYPI_WHEEL_FILES"
|
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||||
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
|
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||||
echo "Wheels uploaded to PyPI"
|
echo "Wheels and source distribution uploaded to PyPI"
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
|||||||
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
|
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
|
||||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||||
aiohttp >= 3.13.3
|
aiohttp >= 3.13.3
|
||||||
openai >= 1.99.1 # For Responses API with reasoning content
|
openai >= 1.99.1, < 2.25.0 # For Responses API with reasoning content
|
||||||
pydantic >= 2.12.0
|
pydantic >= 2.12.0
|
||||||
prometheus_client >= 0.18.0
|
prometheus_client >= 0.18.0
|
||||||
pillow # Required for image processing
|
pillow # Required for image processing
|
||||||
|
|||||||
@@ -15,4 +15,4 @@ torch==2.10.0+xpu
|
|||||||
torchaudio
|
torchaudio
|
||||||
torchvision
|
torchvision
|
||||||
|
|
||||||
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl
|
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.3/vllm_xpu_kernels-0.1.3-cp38-abi3-linux_x86_64.whl
|
||||||
|
|||||||
@@ -29,7 +29,8 @@ def test_tokenizer_like_protocol():
|
|||||||
_assert_tokenizer_like(tokenizer)
|
_assert_tokenizer_like(tokenizer)
|
||||||
|
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
|
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||||
|
tokenizer_mode="mistral",
|
||||||
)
|
)
|
||||||
assert isinstance(tokenizer, MistralTokenizer)
|
assert isinstance(tokenizer, MistralTokenizer)
|
||||||
_assert_tokenizer_like(tokenizer)
|
_assert_tokenizer_like(tokenizer)
|
||||||
@@ -40,11 +41,20 @@ def test_tokenizer_like_protocol():
|
|||||||
|
|
||||||
tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32")
|
tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32")
|
||||||
assert isinstance(tokenizer, HfTokenizer)
|
assert isinstance(tokenizer, HfTokenizer)
|
||||||
|
|
||||||
# Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
|
# Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
|
||||||
assert isinstance(tokenizer, PreTrainedTokenizerFast)
|
assert isinstance(tokenizer, PreTrainedTokenizerFast)
|
||||||
assert "DSV32" in tokenizer.__class__.__name__
|
assert "DSV32" in tokenizer.__class__.__name__
|
||||||
_assert_tokenizer_like(tokenizer)
|
_assert_tokenizer_like(tokenizer)
|
||||||
|
|
||||||
|
tokenizer = get_tokenizer(
|
||||||
|
"Qwen/Qwen-VL",
|
||||||
|
tokenizer_mode="qwen_vl",
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
|
assert isinstance(tokenizer, HfTokenizer)
|
||||||
|
assert "WithoutImagePad" in tokenizer.__class__.__name__
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
|
@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
|
||||||
def test_tokenizer_revision(tokenizer_name: str):
|
def test_tokenizer_revision(tokenizer_name: str):
|
||||||
|
|||||||
@@ -1321,6 +1321,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
- "slow" will always use the slow tokenizer.\n
|
- "slow" will always use the slow tokenizer.\n
|
||||||
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
||||||
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
||||||
|
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
|
||||||
- Other custom values can be supported via plugins.""",
|
- Other custom values can be supported via plugins.""",
|
||||||
)
|
)
|
||||||
parser.add_argument("--use-beam-search", action="store_true")
|
parser.add_argument("--use-beam-search", action="store_true")
|
||||||
|
|||||||
@@ -126,6 +126,7 @@ class ModelConfig:
|
|||||||
- "slow" will always use the slow tokenizer.\n
|
- "slow" will always use the slow tokenizer.\n
|
||||||
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
||||||
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
||||||
|
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
|
||||||
- Other custom values can be supported via plugins."""
|
- Other custom values can be supported via plugins."""
|
||||||
trust_remote_code: bool = False
|
trust_remote_code: bool = False
|
||||||
"""Trust remote code (e.g., from HuggingFace) when downloading the model
|
"""Trust remote code (e.g., from HuggingFace) when downloading the model
|
||||||
|
|||||||
@@ -353,6 +353,39 @@ class Qwen2_5OmniThinkerProcessingInfo(
|
|||||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||||
return {"audio": None, "image": None, "video": None}
|
return {"audio": None, "image": None, "video": None}
|
||||||
|
|
||||||
|
def get_mm_max_tokens_per_item(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int] | None = None,
|
||||||
|
) -> Mapping[str, int] | None:
|
||||||
|
mm_counts = mm_counts or {}
|
||||||
|
requested_modalities = {m for m, c in mm_counts.items() if c > 0}
|
||||||
|
mm_max_tokens: dict[str, int] = {}
|
||||||
|
|
||||||
|
if requested_modalities & {"image", "video"}:
|
||||||
|
vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
|
||||||
|
self,
|
||||||
|
seq_len=seq_len,
|
||||||
|
mm_counts=mm_counts,
|
||||||
|
)
|
||||||
|
mm_max_tokens.update(
|
||||||
|
{
|
||||||
|
m: vl_tokens[m]
|
||||||
|
for m in ["image", "video"]
|
||||||
|
if m in requested_modalities
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if "audio" in requested_modalities:
|
||||||
|
audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
|
||||||
|
self,
|
||||||
|
seq_len=seq_len,
|
||||||
|
mm_counts=mm_counts,
|
||||||
|
)
|
||||||
|
mm_max_tokens["audio"] = audio_tokens["audio"]
|
||||||
|
|
||||||
|
return mm_max_tokens
|
||||||
|
|
||||||
|
|
||||||
class Qwen2_5OmniThinkerDummyInputsBuilder(
|
class Qwen2_5OmniThinkerDummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]
|
BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]
|
||||||
|
|||||||
@@ -179,6 +179,26 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
|
|||||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||||
return {"audio": None}
|
return {"audio": None}
|
||||||
|
|
||||||
|
def get_mm_max_tokens_per_item(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int] | None = None,
|
||||||
|
) -> Mapping[str, int]:
|
||||||
|
mm_counts = mm_counts or {}
|
||||||
|
if mm_counts.get("audio", 0) <= 0:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
feature_extractor = self.get_feature_extractor()
|
||||||
|
chunk_length = min(feature_extractor.chunk_length, 30)
|
||||||
|
audio_len = int(chunk_length * feature_extractor.sampling_rate)
|
||||||
|
hop_length = feature_extractor.hop_length
|
||||||
|
max_mel_seq_len = audio_len // hop_length
|
||||||
|
|
||||||
|
input_lengths = torch.tensor([max_mel_seq_len], dtype=torch.long)
|
||||||
|
_, output_lengths = _get_feat_extract_output_lengths(input_lengths)
|
||||||
|
|
||||||
|
return {"audio": int(output_lengths.item())}
|
||||||
|
|
||||||
|
|
||||||
class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
|
class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
|
||||||
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
|
|||||||
@@ -1163,6 +1163,39 @@ class Qwen3OmniMoeThinkerProcessingInfo(
|
|||||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||||
return {"audio": None, "image": None, "video": None}
|
return {"audio": None, "image": None, "video": None}
|
||||||
|
|
||||||
|
def get_mm_max_tokens_per_item(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int] | None = None,
|
||||||
|
) -> Mapping[str, int] | None:
|
||||||
|
mm_counts = mm_counts or {}
|
||||||
|
requested_modalities = {m for m, c in mm_counts.items() if c > 0}
|
||||||
|
mm_max_tokens: dict[str, int] = {}
|
||||||
|
|
||||||
|
if requested_modalities & {"image", "video"}:
|
||||||
|
vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
|
||||||
|
self,
|
||||||
|
seq_len=seq_len,
|
||||||
|
mm_counts=mm_counts,
|
||||||
|
)
|
||||||
|
mm_max_tokens.update(
|
||||||
|
{
|
||||||
|
m: vl_tokens[m]
|
||||||
|
for m in ["image", "video"]
|
||||||
|
if m in requested_modalities
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if "audio" in requested_modalities:
|
||||||
|
audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
|
||||||
|
self,
|
||||||
|
seq_len=seq_len,
|
||||||
|
mm_counts=mm_counts,
|
||||||
|
)
|
||||||
|
mm_max_tokens["audio"] = audio_tokens["audio"]
|
||||||
|
|
||||||
|
return mm_max_tokens
|
||||||
|
|
||||||
|
|
||||||
Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder
|
Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder
|
||||||
|
|
||||||
|
|||||||
@@ -6,11 +6,9 @@
|
|||||||
# Copyright (c) Alibaba Cloud.
|
# Copyright (c) Alibaba Cloud.
|
||||||
"""Inference-only Qwen-VL model compatible with HuggingFace weights."""
|
"""Inference-only Qwen-VL model compatible with HuggingFace weights."""
|
||||||
|
|
||||||
import copy
|
|
||||||
import math
|
import math
|
||||||
import unicodedata
|
from collections.abc import Callable, Mapping, Sequence
|
||||||
from collections.abc import Callable, Collection, Mapping, Sequence, Set
|
from functools import partial
|
||||||
from functools import lru_cache, partial
|
|
||||||
from typing import Annotated, Literal, TypeAlias
|
from typing import Annotated, Literal, TypeAlias
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
@@ -436,60 +434,6 @@ class QwenVLModel(QWenModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=1)
|
|
||||||
def _get_tokenizer_without_image_pad(
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
) -> PreTrainedTokenizer:
|
|
||||||
"""
|
|
||||||
The logic of adding image pad tokens should only be applied in
|
|
||||||
[`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
|
|
||||||
so they are patched out here.
|
|
||||||
|
|
||||||
The definition of the wrapped tokenizer can be found here:
|
|
||||||
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
|
|
||||||
"""
|
|
||||||
new_tokenizer = copy.deepcopy(tokenizer)
|
|
||||||
|
|
||||||
class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore
|
|
||||||
def tokenize(
|
|
||||||
self,
|
|
||||||
text: str,
|
|
||||||
allowed_special: Set[str] | str = "all",
|
|
||||||
disallowed_special: Collection[str] | str = (),
|
|
||||||
**kwargs,
|
|
||||||
) -> list[bytes | str]:
|
|
||||||
text = unicodedata.normalize("NFC", text)
|
|
||||||
|
|
||||||
return [
|
|
||||||
self.decoder[t]
|
|
||||||
for t in self.tokenizer.encode(
|
|
||||||
text,
|
|
||||||
allowed_special=allowed_special,
|
|
||||||
disallowed_special=disallowed_special,
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
def _decode(
|
|
||||||
self,
|
|
||||||
token_ids: int | list[int],
|
|
||||||
skip_special_tokens: bool = False,
|
|
||||||
errors: str | None = None,
|
|
||||||
**kwargs,
|
|
||||||
) -> str:
|
|
||||||
if isinstance(token_ids, int):
|
|
||||||
token_ids = [token_ids]
|
|
||||||
|
|
||||||
return self.tokenizer.decode(
|
|
||||||
token_ids,
|
|
||||||
errors=errors or self.errors,
|
|
||||||
)
|
|
||||||
|
|
||||||
TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
|
|
||||||
|
|
||||||
new_tokenizer.__class__ = TokenizerWithoutImagePad
|
|
||||||
return new_tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
class QwenVLProcessor:
|
class QwenVLProcessor:
|
||||||
"""
|
"""
|
||||||
This model doesn't define its own HF processor,
|
This model doesn't define its own HF processor,
|
||||||
@@ -574,12 +518,6 @@ class QwenVLProcessor:
|
|||||||
|
|
||||||
|
|
||||||
class QwenVLProcessingInfo(BaseProcessingInfo):
|
class QwenVLProcessingInfo(BaseProcessingInfo):
|
||||||
def get_tokenizer(self) -> PreTrainedTokenizer:
|
|
||||||
tokenizer = self.ctx.get_tokenizer()
|
|
||||||
assert isinstance(tokenizer, PreTrainedTokenizer)
|
|
||||||
|
|
||||||
return _get_tokenizer_without_image_pad(tokenizer)
|
|
||||||
|
|
||||||
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
|
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
|
||||||
return self.ctx.init_processor(
|
return self.ctx.init_processor(
|
||||||
QwenVLProcessor,
|
QwenVLProcessor,
|
||||||
|
|||||||
29
vllm/renderers/qwen_vl.py
Normal file
29
vllm/renderers/qwen_vl.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from vllm.config import VllmConfig
|
||||||
|
from vllm.tokenizers import cached_get_tokenizer
|
||||||
|
from vllm.tokenizers.qwen_vl import QwenVLTokenizer
|
||||||
|
|
||||||
|
from .base import BaseRenderer
|
||||||
|
from .hf import HfRenderer
|
||||||
|
|
||||||
|
|
||||||
|
class QwenVLRenderer(BaseRenderer[QwenVLTokenizer]):
|
||||||
|
@classmethod
|
||||||
|
def from_config( # type: ignore[override]
|
||||||
|
cls,
|
||||||
|
config: VllmConfig,
|
||||||
|
tokenizer_kwargs: dict[str, Any],
|
||||||
|
) -> "HfRenderer":
|
||||||
|
model_config = config.model_config
|
||||||
|
if model_config.skip_tokenizer_init:
|
||||||
|
tokenizer = None
|
||||||
|
else:
|
||||||
|
tokenizer = cached_get_tokenizer(
|
||||||
|
tokenizer_cls=QwenVLTokenizer,
|
||||||
|
**tokenizer_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
return HfRenderer(config, tokenizer)
|
||||||
@@ -20,6 +20,7 @@ _VLLM_RENDERERS = {
|
|||||||
"hf": ("hf", "HfRenderer"),
|
"hf": ("hf", "HfRenderer"),
|
||||||
"grok2": ("grok2", "Grok2Renderer"),
|
"grok2": ("grok2", "Grok2Renderer"),
|
||||||
"mistral": ("mistral", "MistralRenderer"),
|
"mistral": ("mistral", "MistralRenderer"),
|
||||||
|
"qwen_vl": ("qwen_vl", "QwenVLRenderer"),
|
||||||
"terratorch": ("terratorch", "TerratorchRenderer"),
|
"terratorch": ("terratorch", "TerratorchRenderer"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -7,9 +7,9 @@ from transformers import AutoTokenizer
|
|||||||
|
|
||||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||||
|
|
||||||
from . import TokenizerLike
|
|
||||||
from .deepseek_v32_encoding import encode_messages
|
from .deepseek_v32_encoding import encode_messages
|
||||||
from .hf import HfTokenizer, get_cached_tokenizer
|
from .hf import HfTokenizer, get_cached_tokenizer
|
||||||
|
from .protocol import TokenizerLike
|
||||||
|
|
||||||
|
|
||||||
def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
|
def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
|
||||||
|
|||||||
67
vllm/tokenizers/qwen_vl.py
Normal file
67
vllm/tokenizers/qwen_vl.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import copy
|
||||||
|
import unicodedata
|
||||||
|
from collections.abc import Collection, Set
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
from .hf import HfTokenizer, get_cached_tokenizer
|
||||||
|
from .protocol import TokenizerLike
|
||||||
|
|
||||||
|
|
||||||
|
def get_qwen_vl_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
|
||||||
|
"""
|
||||||
|
The logic of adding image pad tokens should only be applied in
|
||||||
|
`QwenVLProcessor`, so they are patched out here.
|
||||||
|
|
||||||
|
The definition of the wrapped tokenizer can be found here:
|
||||||
|
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
|
||||||
|
"""
|
||||||
|
new_tokenizer = copy.copy(tokenizer)
|
||||||
|
|
||||||
|
class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore
|
||||||
|
def tokenize(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
allowed_special: Set[str] | str = "all",
|
||||||
|
disallowed_special: Collection[str] | str = (),
|
||||||
|
**kwargs,
|
||||||
|
) -> list[bytes | str]:
|
||||||
|
text = unicodedata.normalize("NFC", text)
|
||||||
|
|
||||||
|
return [
|
||||||
|
self.decoder[t]
|
||||||
|
for t in self.tokenizer.encode(
|
||||||
|
text,
|
||||||
|
allowed_special=allowed_special,
|
||||||
|
disallowed_special=disallowed_special,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
def _decode(
|
||||||
|
self,
|
||||||
|
token_ids: int | list[int],
|
||||||
|
skip_special_tokens: bool = False,
|
||||||
|
errors: str | None = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> str:
|
||||||
|
if isinstance(token_ids, int):
|
||||||
|
token_ids = [token_ids]
|
||||||
|
|
||||||
|
return self.tokenizer.decode(
|
||||||
|
token_ids,
|
||||||
|
errors=errors or self.errors,
|
||||||
|
)
|
||||||
|
|
||||||
|
TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
|
||||||
|
|
||||||
|
new_tokenizer.__class__ = TokenizerWithoutImagePad
|
||||||
|
return new_tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
class QwenVLTokenizer(TokenizerLike):
|
||||||
|
@classmethod
|
||||||
|
def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
|
||||||
|
return get_cached_tokenizer(get_qwen_vl_tokenizer(tokenizer))
|
||||||
@@ -36,6 +36,7 @@ _VLLM_TOKENIZERS = {
|
|||||||
"grok2": ("grok2", "Grok2Tokenizer"),
|
"grok2": ("grok2", "Grok2Tokenizer"),
|
||||||
"hf": ("hf", "CachedHfTokenizer"),
|
"hf": ("hf", "CachedHfTokenizer"),
|
||||||
"mistral": ("mistral", "MistralTokenizer"),
|
"mistral": ("mistral", "MistralTokenizer"),
|
||||||
|
"qwen_vl": ("qwen_vl", "QwenVLTokenizer"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -165,6 +166,10 @@ def resolve_tokenizer_args(
|
|||||||
):
|
):
|
||||||
tokenizer_mode = "grok2"
|
tokenizer_mode = "grok2"
|
||||||
|
|
||||||
|
# Model-specific tokenizers
|
||||||
|
if tokenizer_mode == "auto" and "/Qwen-VL" in str(tokenizer_name):
|
||||||
|
tokenizer_mode = "qwen_vl"
|
||||||
|
|
||||||
# Fallback to HF tokenizer
|
# Fallback to HF tokenizer
|
||||||
if tokenizer_mode == "auto":
|
if tokenizer_mode == "auto":
|
||||||
tokenizer_mode = "hf"
|
tokenizer_mode = "hf"
|
||||||
|
|||||||
Reference in New Issue
Block a user