Compare commits


5 Commits

Author SHA1 Message Date
khluu
b31e9326a7 Bound openai to under 2.25.0
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-06 13:04:15 -08:00
Doug Smith
e346c08560 [Release] Include source distribution (sdist) in PyPI uploads (#35136)
Signed-off-by: dougbtv <dosmith@redhat.com>
Co-authored-by: Daniele Trifirò <dtrifiro@redhat.com>
(cherry picked from commit 0bfa229bf1)
2026-03-06 13:03:53 -08:00
Avery Miao
b7a423cb01 [BUGFIX]Fix Qwen-Omni models audio max_token_per_item estimation error leading to encoder_cache_size is 0 (#35994)
Signed-off-by: Miao, Avery <avery.miao@intel.com>
(cherry picked from commit e998fa76b9)
2026-03-06 13:03:40 -08:00
Cyrus Leung
fa78ec8a72 [Bugfix] Fix Qwen-VL tokenizer implementation (#36140)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 7196348157)
2026-03-06 13:03:26 -08:00
Kunshang Ji
9a474ce7a4 [XPU] bump vllm-xpu-kernels to v0.1.3 (#35984)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit a8f66cbde8)
2026-03-06 13:03:05 -08:00
16 changed files with 215 additions and 74 deletions

View File

@@ -44,7 +44,7 @@ docker run \
 python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
 python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
 cd tests
-pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
+pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
 pytest -v -s v1/engine
 pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
 pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py

View File

@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
 # include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
 aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
 echo "Wheels copied to local directory"
-# generate source tarball
-git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
+# generate source distribution using setup.py
+python setup.py sdist --dist-dir=$DIST_DIR
 ls -la $DIST_DIR
+SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
+echo "Found sdist: $SDIST_FILE"
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
 PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
 if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
 exit 1
 fi
 
-python3 -m twine check "$PYPI_WHEEL_FILES"
-python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
-echo "Wheels uploaded to PyPI"
+python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
+python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
+echo "Wheels and source distribution uploaded to PyPI"

View File

@@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp >= 3.13.3
-openai >= 1.99.1 # For Responses API with reasoning content
+openai >= 1.99.1, < 2.25.0 # For Responses API with reasoning content
 pydantic >= 2.12.0
 prometheus_client >= 0.18.0
 pillow # Required for image processing
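
How the new bound behaves can be checked against an installed openai package. A small sketch using the packaging library (verification code for illustration only, not part of the diff):

    from importlib.metadata import version
    from packaging.specifiers import SpecifierSet

    # The constraint added above: at least 1.99.1, but below 2.25.0.
    spec = SpecifierSet(">=1.99.1,<2.25.0")
    installed = version("openai")
    print(installed, "satisfies the constraint:", installed in spec)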

View File

@@ -15,4 +15,4 @@ torch==2.10.0+xpu
 torchaudio
 torchvision
-vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl
+vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.3/vllm_xpu_kernels-0.1.3-cp38-abi3-linux_x86_64.whl

View File

@@ -29,7 +29,8 @@ def test_tokenizer_like_protocol():
     _assert_tokenizer_like(tokenizer)
 
     tokenizer = get_tokenizer(
-        "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        tokenizer_mode="mistral",
     )
     assert isinstance(tokenizer, MistralTokenizer)
     _assert_tokenizer_like(tokenizer)
@@ -40,11 +41,20 @@ def test_tokenizer_like_protocol():
     tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32")
     assert isinstance(tokenizer, HfTokenizer)
     # Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
     assert isinstance(tokenizer, PreTrainedTokenizerFast)
     assert "DSV32" in tokenizer.__class__.__name__
     _assert_tokenizer_like(tokenizer)
+
+    tokenizer = get_tokenizer(
+        "Qwen/Qwen-VL",
+        tokenizer_mode="qwen_vl",
+        trust_remote_code=True,
+    )
+    assert isinstance(tokenizer, HfTokenizer)
+    assert "WithoutImagePad" in tokenizer.__class__.__name__
 
 
 @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
 def test_tokenizer_revision(tokenizer_name: str):

View File

@@ -1321,6 +1321,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
- "slow" will always use the slow tokenizer.\n - "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n - "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins.""", - Other custom values can be supported via plugins.""",
) )
parser.add_argument("--use-beam-search", action="store_true") parser.add_argument("--use-beam-search", action="store_true")

View File

@@ -126,6 +126,7 @@ class ModelConfig:
- "slow" will always use the slow tokenizer.\n - "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n - "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins.""" - Other custom values can be supported via plugins."""
trust_remote_code: bool = False trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model """Trust remote code (e.g., from HuggingFace) when downloading the model

View File

@@ -353,6 +353,39 @@ class Qwen2_5OmniThinkerProcessingInfo(
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int] | None:
+        mm_counts = mm_counts or {}
+        requested_modalities = {m for m, c in mm_counts.items() if c > 0}
+        mm_max_tokens: dict[str, int] = {}
+
+        if requested_modalities & {"image", "video"}:
+            vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens.update(
+                {
+                    m: vl_tokens[m]
+                    for m in ["image", "video"]
+                    if m in requested_modalities
+                }
+            )
+
+        if "audio" in requested_modalities:
+            audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens["audio"] = audio_tokens["audio"]
+
+        return mm_max_tokens
+
 
 class Qwen2_5OmniThinkerDummyInputsBuilder(
     BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]
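
The method above delegates per-modality estimation: image and video caps come from Qwen2_5_VLProcessingInfo, while the audio cap comes from the Qwen2AudioProcessingInfo helper added below, so a profile that includes audio no longer reports zero (the condition that left encoder_cache_size at 0). A sketch of how a caller might query it (the ctx construction is elided and the returned numbers depend on the model config):

    # Illustrative only: per-item token caps for the modalities in use.
    info = Qwen2_5OmniThinkerProcessingInfo(ctx)
    caps = info.get_mm_max_tokens_per_item(
        seq_len=32768,
        mm_counts={"audio": 1, "image": 1, "video": 0},
    )
    # e.g. {"image": <VL estimate>, "audio": <audio estimate>};
    # "video" is omitted because its requested count is zero.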

View File

@@ -179,6 +179,26 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int]:
+        mm_counts = mm_counts or {}
+        if mm_counts.get("audio", 0) <= 0:
+            return {}
+
+        feature_extractor = self.get_feature_extractor()
+        chunk_length = min(feature_extractor.chunk_length, 30)
+        audio_len = int(chunk_length * feature_extractor.sampling_rate)
+        hop_length = feature_extractor.hop_length
+        max_mel_seq_len = audio_len // hop_length
+
+        input_lengths = torch.tensor([max_mel_seq_len], dtype=torch.long)
+        _, output_lengths = _get_feat_extract_output_lengths(input_lengths)
+        return {"audio": int(output_lengths.item())}
+
 
 class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
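
To make the arithmetic concrete: with Whisper-style defaults (30 s chunks, 16 kHz sampling, hop length 160) the method computes 30 * 16000 = 480000 samples and 480000 // 160 = 3000 mel frames. Assuming _get_feat_extract_output_lengths applies the encoder's two stride-2 reductions, that yields 750 encoder tokens per audio item. A standalone sketch of the same computation (defaults assumed rather than read from a real feature extractor):

    # Assumed Whisper-style defaults; real values come from the
    # model's preprocessor config.
    chunk_length, sampling_rate, hop_length = 30, 16_000, 160

    audio_len = chunk_length * sampling_rate   # 480000 samples
    max_mel_seq_len = audio_len // hop_length  # 3000 mel frames

    # Two stride-2 reductions, mirroring the feature-extractor helper:
    feat_len = (max_mel_seq_len - 1) // 2 + 1  # 1500
    out_len = (feat_len - 2) // 2 + 1          # 750 encoder tokens
    print(out_len)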

View File

@@ -1163,6 +1163,39 @@ class Qwen3OmniMoeThinkerProcessingInfo(
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int] | None:
+        mm_counts = mm_counts or {}
+        requested_modalities = {m for m, c in mm_counts.items() if c > 0}
+        mm_max_tokens: dict[str, int] = {}
+
+        if requested_modalities & {"image", "video"}:
+            vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens.update(
+                {
+                    m: vl_tokens[m]
+                    for m in ["image", "video"]
+                    if m in requested_modalities
+                }
+            )
+
+        if "audio" in requested_modalities:
+            audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens["audio"] = audio_tokens["audio"]
+
+        return mm_max_tokens
+
 
 Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder

View File

@@ -6,11 +6,9 @@
 # Copyright (c) Alibaba Cloud.
 """Inference-only Qwen-VL model compatible with HuggingFace weights."""
 
-import copy
 import math
-import unicodedata
-from collections.abc import Callable, Collection, Mapping, Sequence, Set
-from functools import lru_cache, partial
+from collections.abc import Callable, Mapping, Sequence
+from functools import partial
 from typing import Annotated, Literal, TypeAlias
 
 import regex as re
@@ -436,60 +434,6 @@ class QwenVLModel(QWenModel):
     )
 
-
-@lru_cache(maxsize=1)
-def _get_tokenizer_without_image_pad(
-    tokenizer: PreTrainedTokenizer,
-) -> PreTrainedTokenizer:
-    """
-    The logic of adding image pad tokens should only be applied in
-    [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
-    so they are patched out here.
-
-    The definition of the wrapped tokenizer can be found here:
-    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
-    """
-    new_tokenizer = copy.deepcopy(tokenizer)
-
-    class TokenizerWithoutImagePad(tokenizer.__class__):  # type: ignore
-        def tokenize(
-            self,
-            text: str,
-            allowed_special: Set[str] | str = "all",
-            disallowed_special: Collection[str] | str = (),
-            **kwargs,
-        ) -> list[bytes | str]:
-            text = unicodedata.normalize("NFC", text)
-
-            return [
-                self.decoder[t]
-                for t in self.tokenizer.encode(
-                    text,
-                    allowed_special=allowed_special,
-                    disallowed_special=disallowed_special,
-                )
-            ]
-
-        def _decode(
-            self,
-            token_ids: int | list[int],
-            skip_special_tokens: bool = False,
-            errors: str | None = None,
-            **kwargs,
-        ) -> str:
-            if isinstance(token_ids, int):
-                token_ids = [token_ids]
-
-            return self.tokenizer.decode(
-                token_ids,
-                errors=errors or self.errors,
-            )
-
-    TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
-
-    new_tokenizer.__class__ = TokenizerWithoutImagePad
-    return new_tokenizer
-
-
 class QwenVLProcessor:
     """
     This model doesn't define its own HF processor,
@@ -574,12 +518,6 @@ class QwenVLProcessor:
 class QwenVLProcessingInfo(BaseProcessingInfo):
-    def get_tokenizer(self) -> PreTrainedTokenizer:
-        tokenizer = self.ctx.get_tokenizer()
-        assert isinstance(tokenizer, PreTrainedTokenizer)
-
-        return _get_tokenizer_without_image_pad(tokenizer)
-
     def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
         return self.ctx.init_processor(
             QwenVLProcessor,

vllm/renderers/qwen_vl.py (new file, +29)
View File

@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Any
+
+from vllm.config import VllmConfig
+from vllm.tokenizers import cached_get_tokenizer
+from vllm.tokenizers.qwen_vl import QwenVLTokenizer
+
+from .base import BaseRenderer
+from .hf import HfRenderer
+
+
+class QwenVLRenderer(BaseRenderer[QwenVLTokenizer]):
+    @classmethod
+    def from_config(  # type: ignore[override]
+        cls,
+        config: VllmConfig,
+        tokenizer_kwargs: dict[str, Any],
+    ) -> "HfRenderer":
+        model_config = config.model_config
+        if model_config.skip_tokenizer_init:
+            tokenizer = None
+        else:
+            tokenizer = cached_get_tokenizer(
+                tokenizer_cls=QwenVLTokenizer,
+                **tokenizer_kwargs,
+            )
+        return HfRenderer(config, tokenizer)

View File

@@ -20,6 +20,7 @@ _VLLM_RENDERERS = {
"hf": ("hf", "HfRenderer"), "hf": ("hf", "HfRenderer"),
"grok2": ("grok2", "Grok2Renderer"), "grok2": ("grok2", "Grok2Renderer"),
"mistral": ("mistral", "MistralRenderer"), "mistral": ("mistral", "MistralRenderer"),
"qwen_vl": ("qwen_vl", "QwenVLRenderer"),
"terratorch": ("terratorch", "TerratorchRenderer"), "terratorch": ("terratorch", "TerratorchRenderer"),
} }
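
Entries in this registry are (module, class) pairs, presumably imported lazily on first use. A minimal sketch of that resolution pattern (illustrative; the actual vLLM loader may differ):

    import importlib

    def resolve_renderer(name: str):
        # Look up the lazy (module, class) pair and import it on demand.
        module_name, class_name = _VLLM_RENDERERS[name]
        module = importlib.import_module(f"vllm.renderers.{module_name}")
        return getattr(module, class_name)

    # resolve_renderer("qwen_vl") -> QwenVLRenderer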

View File

@@ -7,9 +7,9 @@ from transformers import AutoTokenizer
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
-from . import TokenizerLike
 from .deepseek_v32_encoding import encode_messages
 from .hf import HfTokenizer, get_cached_tokenizer
+from .protocol import TokenizerLike
 
 
 def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:

View File

@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import copy
+import unicodedata
+from collections.abc import Collection, Set
+
+from transformers import AutoTokenizer
+
+from .hf import HfTokenizer, get_cached_tokenizer
+from .protocol import TokenizerLike
+
+
+def get_qwen_vl_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
+    """
+    The logic of adding image pad tokens should only be applied in
+    `QwenVLProcessor`, so they are patched out here.
+
+    The definition of the wrapped tokenizer can be found here:
+    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
+    """
+    new_tokenizer = copy.copy(tokenizer)
+
+    class TokenizerWithoutImagePad(tokenizer.__class__):  # type: ignore
+        def tokenize(
+            self,
+            text: str,
+            allowed_special: Set[str] | str = "all",
+            disallowed_special: Collection[str] | str = (),
+            **kwargs,
+        ) -> list[bytes | str]:
+            text = unicodedata.normalize("NFC", text)
+
+            return [
+                self.decoder[t]
+                for t in self.tokenizer.encode(
+                    text,
+                    allowed_special=allowed_special,
+                    disallowed_special=disallowed_special,
+                )
+            ]
+
+        def _decode(
+            self,
+            token_ids: int | list[int],
+            skip_special_tokens: bool = False,
+            errors: str | None = None,
+            **kwargs,
+        ) -> str:
+            if isinstance(token_ids, int):
+                token_ids = [token_ids]
+
+            return self.tokenizer.decode(
+                token_ids,
+                errors=errors or self.errors,
+            )
+
+    TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
+
+    new_tokenizer.__class__ = TokenizerWithoutImagePad
+    return new_tokenizer
+
+
+class QwenVLTokenizer(TokenizerLike):
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
+        tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
+        return get_cached_tokenizer(get_qwen_vl_tokenizer(tokenizer))
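
The wrapper swaps the tokenizer's class for a subclass whose tokenize and _decode bypass the image-pad insertion, leaving plain-text round-trips untouched by the multimodal padding logic. A usage sketch (assumes the model can be downloaded and trust_remote_code is acceptable):

    from vllm.tokenizers.qwen_vl import QwenVLTokenizer

    tok = QwenVLTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)

    # The patched class advertises itself via its name, as the new test asserts.
    assert "WithoutImagePad" in tok.__class__.__name__
    print(tok.decode(tok.encode("Hello, world!")))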

View File

@@ -36,6 +36,7 @@ _VLLM_TOKENIZERS = {
"grok2": ("grok2", "Grok2Tokenizer"), "grok2": ("grok2", "Grok2Tokenizer"),
"hf": ("hf", "CachedHfTokenizer"), "hf": ("hf", "CachedHfTokenizer"),
"mistral": ("mistral", "MistralTokenizer"), "mistral": ("mistral", "MistralTokenizer"),
"qwen_vl": ("qwen_vl", "QwenVLTokenizer"),
} }
@@ -165,6 +166,10 @@ def resolve_tokenizer_args(
): ):
tokenizer_mode = "grok2" tokenizer_mode = "grok2"
# Model-specific tokenizers
if tokenizer_mode == "auto" and "/Qwen-VL" in str(tokenizer_name):
tokenizer_mode = "qwen_vl"
# Fallback to HF tokenizer # Fallback to HF tokenizer
if tokenizer_mode == "auto": if tokenizer_mode == "auto":
tokenizer_mode = "hf" tokenizer_mode = "hf"