Compare commits

...

5 Commits

Author SHA1 Message Date
khluu
b31e9326a7 Bound openai to under 2.25.0
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-06 13:04:15 -08:00
Doug Smith
e346c08560 [Release] Include source distribution (sdist) in PyPI uploads (#35136)
Signed-off-by: dougbtv <dosmith@redhat.com>
Co-authored-by: Daniele Trifirò <dtrifiro@redhat.com>
(cherry picked from commit 0bfa229bf1)
2026-03-06 13:03:53 -08:00
Avery Miao
b7a423cb01 [BUGFIX]Fix Qwen-Omni models audio max_token_per_item estimation error leading to encoder_cache_size is 0 (#35994)
Signed-off-by: Miao, Avery <avery.miao@intel.com>
(cherry picked from commit e998fa76b9)
2026-03-06 13:03:40 -08:00
Cyrus Leung
fa78ec8a72 [Bugfix] Fix Qwen-VL tokenizer implementation (#36140)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 7196348157)
2026-03-06 13:03:26 -08:00
Kunshang Ji
9a474ce7a4 [XPU] bump vllm-xpu-kernels to v0.1.3 (#35984)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit a8f66cbde8)
2026-03-06 13:03:05 -08:00
16 changed files with 215 additions and 74 deletions

View File

@@ -44,7 +44,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py

View File

@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
# generate source distribution using setup.py
python setup.py sdist --dist-dir=$DIST_DIR
ls -la $DIST_DIR
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
echo "Found sdist: $SDIST_FILE"
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1
fi
python3 -m twine check "$PYPI_WHEEL_FILES"
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
echo "Wheels uploaded to PyPI"
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
echo "Wheels and source distribution uploaded to PyPI"

View File

@@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp >= 3.13.3
openai >= 1.99.1 # For Responses API with reasoning content
openai >= 1.99.1, < 2.25.0 # For Responses API with reasoning content
pydantic >= 2.12.0
prometheus_client >= 0.18.0
pillow # Required for image processing

View File

@@ -15,4 +15,4 @@ torch==2.10.0+xpu
torchaudio
torchvision
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.3/vllm_xpu_kernels-0.1.3-cp38-abi3-linux_x86_64.whl

View File

@@ -29,7 +29,8 @@ def test_tokenizer_like_protocol():
_assert_tokenizer_like(tokenizer)
tokenizer = get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
"mistralai/Mistral-7B-Instruct-v0.3",
tokenizer_mode="mistral",
)
assert isinstance(tokenizer, MistralTokenizer)
_assert_tokenizer_like(tokenizer)
@@ -40,11 +41,20 @@ def test_tokenizer_like_protocol():
tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32")
assert isinstance(tokenizer, HfTokenizer)
# Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
assert "DSV32" in tokenizer.__class__.__name__
_assert_tokenizer_like(tokenizer)
tokenizer = get_tokenizer(
"Qwen/Qwen-VL",
tokenizer_mode="qwen_vl",
trust_remote_code=True,
)
assert isinstance(tokenizer, HfTokenizer)
assert "WithoutImagePad" in tokenizer.__class__.__name__
@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
def test_tokenizer_revision(tokenizer_name: str):

View File

@@ -1321,6 +1321,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins.""",
)
parser.add_argument("--use-beam-search", action="store_true")

View File

@@ -126,6 +126,7 @@ class ModelConfig:
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model

View File

@@ -353,6 +353,39 @@ class Qwen2_5OmniThinkerProcessingInfo(
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"audio": None, "image": None, "video": None}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int] | None = None,
) -> Mapping[str, int] | None:
mm_counts = mm_counts or {}
requested_modalities = {m for m, c in mm_counts.items() if c > 0}
mm_max_tokens: dict[str, int] = {}
if requested_modalities & {"image", "video"}:
vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
self,
seq_len=seq_len,
mm_counts=mm_counts,
)
mm_max_tokens.update(
{
m: vl_tokens[m]
for m in ["image", "video"]
if m in requested_modalities
}
)
if "audio" in requested_modalities:
audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
self,
seq_len=seq_len,
mm_counts=mm_counts,
)
mm_max_tokens["audio"] = audio_tokens["audio"]
return mm_max_tokens
class Qwen2_5OmniThinkerDummyInputsBuilder(
BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]

View File

@@ -179,6 +179,26 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"audio": None}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int] | None = None,
) -> Mapping[str, int]:
mm_counts = mm_counts or {}
if mm_counts.get("audio", 0) <= 0:
return {}
feature_extractor = self.get_feature_extractor()
chunk_length = min(feature_extractor.chunk_length, 30)
audio_len = int(chunk_length * feature_extractor.sampling_rate)
hop_length = feature_extractor.hop_length
max_mel_seq_len = audio_len // hop_length
input_lengths = torch.tensor([max_mel_seq_len], dtype=torch.long)
_, output_lengths = _get_feat_extract_output_lengths(input_lengths)
return {"audio": int(output_lengths.item())}
class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:

View File

@@ -1163,6 +1163,39 @@ class Qwen3OmniMoeThinkerProcessingInfo(
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"audio": None, "image": None, "video": None}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int] | None = None,
) -> Mapping[str, int] | None:
mm_counts = mm_counts or {}
requested_modalities = {m for m, c in mm_counts.items() if c > 0}
mm_max_tokens: dict[str, int] = {}
if requested_modalities & {"image", "video"}:
vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
self,
seq_len=seq_len,
mm_counts=mm_counts,
)
mm_max_tokens.update(
{
m: vl_tokens[m]
for m in ["image", "video"]
if m in requested_modalities
}
)
if "audio" in requested_modalities:
audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
self,
seq_len=seq_len,
mm_counts=mm_counts,
)
mm_max_tokens["audio"] = audio_tokens["audio"]
return mm_max_tokens
Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder

View File

@@ -6,11 +6,9 @@
# Copyright (c) Alibaba Cloud.
"""Inference-only Qwen-VL model compatible with HuggingFace weights."""
import copy
import math
import unicodedata
from collections.abc import Callable, Collection, Mapping, Sequence, Set
from functools import lru_cache, partial
from collections.abc import Callable, Mapping, Sequence
from functools import partial
from typing import Annotated, Literal, TypeAlias
import regex as re
@@ -436,60 +434,6 @@ class QwenVLModel(QWenModel):
)
@lru_cache(maxsize=1)
def _get_tokenizer_without_image_pad(
tokenizer: PreTrainedTokenizer,
) -> PreTrainedTokenizer:
"""
The logic of adding image pad tokens should only be applied in
[`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
so they are patched out here.
The definition of the wrapped tokenizer can be found here:
https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
"""
new_tokenizer = copy.deepcopy(tokenizer)
class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore
def tokenize(
self,
text: str,
allowed_special: Set[str] | str = "all",
disallowed_special: Collection[str] | str = (),
**kwargs,
) -> list[bytes | str]:
text = unicodedata.normalize("NFC", text)
return [
self.decoder[t]
for t in self.tokenizer.encode(
text,
allowed_special=allowed_special,
disallowed_special=disallowed_special,
)
]
def _decode(
self,
token_ids: int | list[int],
skip_special_tokens: bool = False,
errors: str | None = None,
**kwargs,
) -> str:
if isinstance(token_ids, int):
token_ids = [token_ids]
return self.tokenizer.decode(
token_ids,
errors=errors or self.errors,
)
TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
new_tokenizer.__class__ = TokenizerWithoutImagePad
return new_tokenizer
class QwenVLProcessor:
"""
This model doesn't define its own HF processor,
@@ -574,12 +518,6 @@ class QwenVLProcessor:
class QwenVLProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self) -> PreTrainedTokenizer:
tokenizer = self.ctx.get_tokenizer()
assert isinstance(tokenizer, PreTrainedTokenizer)
return _get_tokenizer_without_image_pad(tokenizer)
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
return self.ctx.init_processor(
QwenVLProcessor,

29
vllm/renderers/qwen_vl.py Normal file
View File

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from vllm.config import VllmConfig
from vllm.tokenizers import cached_get_tokenizer
from vllm.tokenizers.qwen_vl import QwenVLTokenizer
from .base import BaseRenderer
from .hf import HfRenderer
class QwenVLRenderer(BaseRenderer[QwenVLTokenizer]):
    """Renderer factory that pairs an ``HfRenderer`` with a Qwen-VL tokenizer.

    The class is used only through ``from_config``, which hands back an
    ``HfRenderer`` rather than an instance of this class.
    """

    @classmethod
    def from_config(  # type: ignore[override]
        cls,
        config: VllmConfig,
        tokenizer_kwargs: dict[str, Any],
    ) -> "HfRenderer":
        """Build an ``HfRenderer`` for ``config``.

        Args:
            config: The vLLM configuration; its ``model_config`` decides
                whether a tokenizer is created.
            tokenizer_kwargs: Keyword arguments forwarded to
                ``cached_get_tokenizer``.

        Returns:
            An ``HfRenderer`` whose tokenizer is ``None`` when
            ``skip_tokenizer_init`` is set, and a ``QwenVLTokenizer``-built
            tokenizer otherwise.
        """
        if config.model_config.skip_tokenizer_init:
            return HfRenderer(config, None)

        return HfRenderer(
            config,
            cached_get_tokenizer(
                tokenizer_cls=QwenVLTokenizer,
                **tokenizer_kwargs,
            ),
        )

View File

@@ -20,6 +20,7 @@ _VLLM_RENDERERS = {
"hf": ("hf", "HfRenderer"),
"grok2": ("grok2", "Grok2Renderer"),
"mistral": ("mistral", "MistralRenderer"),
"qwen_vl": ("qwen_vl", "QwenVLRenderer"),
"terratorch": ("terratorch", "TerratorchRenderer"),
}

View File

@@ -7,9 +7,9 @@ from transformers import AutoTokenizer
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from . import TokenizerLike
from .deepseek_v32_encoding import encode_messages
from .hf import HfTokenizer, get_cached_tokenizer
from .protocol import TokenizerLike
def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:

View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import unicodedata
from collections.abc import Collection, Set
from transformers import AutoTokenizer
from .hf import HfTokenizer, get_cached_tokenizer
from .protocol import TokenizerLike
def get_qwen_vl_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
    """
    Return a shallow copy of `tokenizer` with image-pad handling removed.

    The logic of adding image pad tokens should only be applied in
    `QwenVLProcessor`, so `tokenize` and `_decode` are overridden here to
    go straight to the wrapped tokenizer's `encode` / `decode`.

    The definition of the wrapped tokenizer can be found here:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
    """
    # Shallow copy: the caller's object keeps its original class.
    patched = copy.copy(tokenizer)
    base_cls = tokenizer.__class__

    class TokenizerWithoutImagePad(base_cls):  # type: ignore
        def tokenize(
            self,
            text: str,
            allowed_special: Set[str] | str = "all",
            disallowed_special: Collection[str] | str = (),
            **kwargs,
        ) -> list[bytes | str]:
            normalized = unicodedata.normalize("NFC", text)
            token_ids = self.tokenizer.encode(
                normalized,
                allowed_special=allowed_special,
                disallowed_special=disallowed_special,
            )
            return [self.decoder[token_id] for token_id in token_ids]

        def _decode(
            self,
            token_ids: int | list[int],
            skip_special_tokens: bool = False,
            errors: str | None = None,
            **kwargs,
        ) -> str:
            # A bare int is treated as a one-element sequence.
            ids = [token_ids] if isinstance(token_ids, int) else token_ids
            return self.tokenizer.decode(ids, errors=errors or self.errors)

    # Rename the subclass after the class it wraps.
    TokenizerWithoutImagePad.__name__ = f"{base_cls.__name__}WithoutImagePad"

    patched.__class__ = TokenizerWithoutImagePad
    return patched
class QwenVLTokenizer(TokenizerLike):
    """Loader for the Qwen-VL tokenizer with image-pad handling patched out."""

    @classmethod
    def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
        """Load, patch and cache-wrap the tokenizer.

        All positional and keyword arguments are forwarded unchanged to
        ``AutoTokenizer.from_pretrained``. The loaded tokenizer is passed
        through ``get_qwen_vl_tokenizer`` and then ``get_cached_tokenizer``.
        """
        loaded = AutoTokenizer.from_pretrained(*args, **kwargs)
        without_image_pad = get_qwen_vl_tokenizer(loaded)
        return get_cached_tokenizer(without_image_pad)

View File

@@ -36,6 +36,7 @@ _VLLM_TOKENIZERS = {
"grok2": ("grok2", "Grok2Tokenizer"),
"hf": ("hf", "CachedHfTokenizer"),
"mistral": ("mistral", "MistralTokenizer"),
"qwen_vl": ("qwen_vl", "QwenVLTokenizer"),
}
@@ -165,6 +166,10 @@ def resolve_tokenizer_args(
):
tokenizer_mode = "grok2"
# Model-specific tokenizers
if tokenizer_mode == "auto" and "/Qwen-VL" in str(tokenizer_name):
tokenizer_mode = "qwen_vl"
# Fallback to HF tokenizer
if tokenizer_mode == "auto":
tokenizer_mode = "hf"