Compare commits


5 Commits

Author SHA1 Message Date
khluu
b31e9326a7 Bound openai to under 2.25.0
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-06 13:04:15 -08:00
Doug Smith
e346c08560 [Release] Include source distribution (sdist) in PyPI uploads (#35136)
Signed-off-by: dougbtv <dosmith@redhat.com>
Co-authored-by: Daniele Trifirò <dtrifiro@redhat.com>
(cherry picked from commit 0bfa229bf1)
2026-03-06 13:03:53 -08:00
Avery Miao
b7a423cb01 [BUGFIX]Fix Qwen-Omni models audio max_token_per_item estimation error leading to encoder_cache_size is 0 (#35994)
Signed-off-by: Miao, Avery <avery.miao@intel.com>
(cherry picked from commit e998fa76b9)
2026-03-06 13:03:40 -08:00
Cyrus Leung
fa78ec8a72 [Bugfix] Fix Qwen-VL tokenizer implementation (#36140)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 7196348157)
2026-03-06 13:03:26 -08:00
Kunshang Ji
9a474ce7a4 [XPU] bump vllm-xpu-kernels to v0.1.3 (#35984)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit a8f66cbde8)
2026-03-06 13:03:05 -08:00
16 changed files with 215 additions and 74 deletions

View File

@@ -44,7 +44,7 @@ docker run \
 python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
 python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
 cd tests
-pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
+pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
 pytest -v -s v1/engine
 pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
 pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py

View File

@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
 # include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
 aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
 echo "Wheels copied to local directory"
-# generate source tarball
-git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
+# generate source distribution using setup.py
+python setup.py sdist --dist-dir=$DIST_DIR
 ls -la $DIST_DIR
+SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
+echo "Found sdist: $SDIST_FILE"
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
 PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
 if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
 exit 1
 fi
 
-python3 -m twine check "$PYPI_WHEEL_FILES"
-python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
-echo "Wheels uploaded to PyPI"
+python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
+python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
+echo "Wheels and source distribution uploaded to PyPI"

View File

@@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp >= 3.13.3
-openai >= 1.99.1 # For Responses API with reasoning content
+openai >= 1.99.1, < 2.25.0 # For Responses API with reasoning content
 pydantic >= 2.12.0
 prometheus_client >= 0.18.0
 pillow # Required for image processing
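
How the new bound behaves can be checked against an installed openai package. A small sketch using the packaging library (verification code for illustration only, not part of the diff):

    from importlib.metadata import version
    from packaging.specifiers import SpecifierSet

    # The constraint added above: at least 1.99.1, but below 2.25.0.
    spec = SpecifierSet(">=1.99.1,<2.25.0")
    installed = version("openai")
    print(installed, "satisfies the constraint:", installed in spec)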

View File

@@ -15,4 +15,4 @@ torch==2.10.0+xpu
 torchaudio
 torchvision
-vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl
+vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.3/vllm_xpu_kernels-0.1.3-cp38-abi3-linux_x86_64.whl

View File

@@ -29,7 +29,8 @@ def test_tokenizer_like_protocol():
     _assert_tokenizer_like(tokenizer)
 
     tokenizer = get_tokenizer(
-        "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        tokenizer_mode="mistral",
     )
     assert isinstance(tokenizer, MistralTokenizer)
     _assert_tokenizer_like(tokenizer)
@@ -40,11 +41,20 @@ def test_tokenizer_like_protocol():
     tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32")
     assert isinstance(tokenizer, HfTokenizer)
     # Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
     assert isinstance(tokenizer, PreTrainedTokenizerFast)
     assert "DSV32" in tokenizer.__class__.__name__
     _assert_tokenizer_like(tokenizer)
+
+    tokenizer = get_tokenizer(
+        "Qwen/Qwen-VL",
+        tokenizer_mode="qwen_vl",
+        trust_remote_code=True,
+    )
+    assert isinstance(tokenizer, HfTokenizer)
+    assert "WithoutImagePad" in tokenizer.__class__.__name__
 
 
 @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
 def test_tokenizer_revision(tokenizer_name: str):

View File

@@ -1321,6 +1321,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
- "slow" will always use the slow tokenizer.\n - "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n - "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins.""", - Other custom values can be supported via plugins.""",
) )
parser.add_argument("--use-beam-search", action="store_true") parser.add_argument("--use-beam-search", action="store_true")

View File

@@ -126,6 +126,7 @@ class ModelConfig:
- "slow" will always use the slow tokenizer.\n - "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n - "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins.""" - Other custom values can be supported via plugins."""
trust_remote_code: bool = False trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model """Trust remote code (e.g., from HuggingFace) when downloading the model

View File

@@ -353,6 +353,39 @@ class Qwen2_5OmniThinkerProcessingInfo(
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int] | None:
+        mm_counts = mm_counts or {}
+        requested_modalities = {m for m, c in mm_counts.items() if c > 0}
+        mm_max_tokens: dict[str, int] = {}
+
+        if requested_modalities & {"image", "video"}:
+            vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens.update(
+                {
+                    m: vl_tokens[m]
+                    for m in ["image", "video"]
+                    if m in requested_modalities
+                }
+            )
+
+        if "audio" in requested_modalities:
+            audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens["audio"] = audio_tokens["audio"]
+
+        return mm_max_tokens
+
 
 class Qwen2_5OmniThinkerDummyInputsBuilder(
     BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]
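
The method above delegates per-modality estimation: image and video caps come from Qwen2_5_VLProcessingInfo, while the audio cap comes from the Qwen2AudioProcessingInfo helper added below, so a profile that includes audio no longer reports zero (the condition that left encoder_cache_size at 0). A sketch of how a caller might query it (the ctx construction is elided and the returned numbers depend on the model config):

    # Illustrative only: per-item token caps for the modalities in use.
    info = Qwen2_5OmniThinkerProcessingInfo(ctx)
    caps = info.get_mm_max_tokens_per_item(
        seq_len=32768,
        mm_counts={"audio": 1, "image": 1, "video": 0},
    )
    # e.g. {"image": <VL estimate>, "audio": <audio estimate>};
    # "video" is omitted because its requested count is zero.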

View File

@@ -179,6 +179,26 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int]:
+        mm_counts = mm_counts or {}
+        if mm_counts.get("audio", 0) <= 0:
+            return {}
+
+        feature_extractor = self.get_feature_extractor()
+        chunk_length = min(feature_extractor.chunk_length, 30)
+        audio_len = int(chunk_length * feature_extractor.sampling_rate)
+        hop_length = feature_extractor.hop_length
+        max_mel_seq_len = audio_len // hop_length
+
+        input_lengths = torch.tensor([max_mel_seq_len], dtype=torch.long)
+        _, output_lengths = _get_feat_extract_output_lengths(input_lengths)
+        return {"audio": int(output_lengths.item())}
+
 
 class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
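
To make the arithmetic concrete: with Whisper-style defaults (30 s chunks, 16 kHz sampling, hop length 160) the method computes 30 * 16000 = 480000 samples and 480000 // 160 = 3000 mel frames. Assuming _get_feat_extract_output_lengths applies the encoder's two stride-2 reductions, that yields 750 encoder tokens per audio item. A standalone sketch of the same computation (defaults assumed rather than read from a real feature extractor):

    # Assumed Whisper-style defaults; real values come from the
    # model's preprocessor config.
    chunk_length, sampling_rate, hop_length = 30, 16_000, 160

    audio_len = chunk_length * sampling_rate   # 480000 samples
    max_mel_seq_len = audio_len // hop_length  # 3000 mel frames

    # Two stride-2 reductions, mirroring the feature-extractor helper:
    feat_len = (max_mel_seq_len - 1) // 2 + 1  # 1500
    out_len = (feat_len - 2) // 2 + 1          # 750 encoder tokens
    print(out_len)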

View File

@@ -1163,6 +1163,39 @@ class Qwen3OmniMoeThinkerProcessingInfo(
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int] | None = None,
+    ) -> Mapping[str, int] | None:
+        mm_counts = mm_counts or {}
+        requested_modalities = {m for m, c in mm_counts.items() if c > 0}
+        mm_max_tokens: dict[str, int] = {}
+
+        if requested_modalities & {"image", "video"}:
+            vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens.update(
+                {
+                    m: vl_tokens[m]
+                    for m in ["image", "video"]
+                    if m in requested_modalities
+                }
+            )
+
+        if "audio" in requested_modalities:
+            audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
+                self,
+                seq_len=seq_len,
+                mm_counts=mm_counts,
+            )
+            mm_max_tokens["audio"] = audio_tokens["audio"]
+
+        return mm_max_tokens
+
 
 Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder

View File

@@ -6,11 +6,9 @@
 # Copyright (c) Alibaba Cloud.
 """Inference-only Qwen-VL model compatible with HuggingFace weights."""
 
-import copy
 import math
-import unicodedata
-from collections.abc import Callable, Collection, Mapping, Sequence, Set
-from functools import lru_cache, partial
+from collections.abc import Callable, Mapping, Sequence
+from functools import partial
 from typing import Annotated, Literal, TypeAlias
 
 import regex as re
@@ -436,60 +434,6 @@ class QwenVLModel(QWenModel):
     )
 
-
-@lru_cache(maxsize=1)
-def _get_tokenizer_without_image_pad(
-    tokenizer: PreTrainedTokenizer,
-) -> PreTrainedTokenizer:
-    """
-    The logic of adding image pad tokens should only be applied in
-    [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
-    so they are patched out here.
-
-    The definition of the wrapped tokenizer can be found here:
-    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
-    """
-    new_tokenizer = copy.deepcopy(tokenizer)
-
-    class TokenizerWithoutImagePad(tokenizer.__class__):  # type: ignore
-        def tokenize(
-            self,
-            text: str,
-            allowed_special: Set[str] | str = "all",
-            disallowed_special: Collection[str] | str = (),
-            **kwargs,
-        ) -> list[bytes | str]:
-            text = unicodedata.normalize("NFC", text)
-
-            return [
-                self.decoder[t]
-                for t in self.tokenizer.encode(
-                    text,
-                    allowed_special=allowed_special,
-                    disallowed_special=disallowed_special,
-                )
-            ]
-
-        def _decode(
-            self,
-            token_ids: int | list[int],
-            skip_special_tokens: bool = False,
-            errors: str | None = None,
-            **kwargs,
-        ) -> str:
-            if isinstance(token_ids, int):
-                token_ids = [token_ids]
-
-            return self.tokenizer.decode(
-                token_ids,
-                errors=errors or self.errors,
-            )
-
-    TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
-
-    new_tokenizer.__class__ = TokenizerWithoutImagePad
-    return new_tokenizer
-
-
 class QwenVLProcessor:
     """
     This model doesn't define its own HF processor,
@@ -574,12 +518,6 @@ class QwenVLProcessor:
 class QwenVLProcessingInfo(BaseProcessingInfo):
-    def get_tokenizer(self) -> PreTrainedTokenizer:
-        tokenizer = self.ctx.get_tokenizer()
-        assert isinstance(tokenizer, PreTrainedTokenizer)
-
-        return _get_tokenizer_without_image_pad(tokenizer)
-
     def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
         return self.ctx.init_processor(
             QwenVLProcessor,

vllm/renderers/qwen_vl.py (new file, +29)
View File

@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Any
+
+from vllm.config import VllmConfig
+from vllm.tokenizers import cached_get_tokenizer
+from vllm.tokenizers.qwen_vl import QwenVLTokenizer
+
+from .base import BaseRenderer
+from .hf import HfRenderer
+
+
+class QwenVLRenderer(BaseRenderer[QwenVLTokenizer]):
+    @classmethod
+    def from_config(  # type: ignore[override]
+        cls,
+        config: VllmConfig,
+        tokenizer_kwargs: dict[str, Any],
+    ) -> "HfRenderer":
+        model_config = config.model_config
+        if model_config.skip_tokenizer_init:
+            tokenizer = None
+        else:
+            tokenizer = cached_get_tokenizer(
+                tokenizer_cls=QwenVLTokenizer,
+                **tokenizer_kwargs,
+            )
+        return HfRenderer(config, tokenizer)

View File

@@ -20,6 +20,7 @@ _VLLM_RENDERERS = {
"hf": ("hf", "HfRenderer"), "hf": ("hf", "HfRenderer"),
"grok2": ("grok2", "Grok2Renderer"), "grok2": ("grok2", "Grok2Renderer"),
"mistral": ("mistral", "MistralRenderer"), "mistral": ("mistral", "MistralRenderer"),
"qwen_vl": ("qwen_vl", "QwenVLRenderer"),
"terratorch": ("terratorch", "TerratorchRenderer"), "terratorch": ("terratorch", "TerratorchRenderer"),
} }
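
Entries in this registry are (module, class) pairs, presumably imported lazily on first use. A minimal sketch of that resolution pattern (illustrative; the actual vLLM loader may differ):

    import importlib

    def resolve_renderer(name: str):
        # Look up the lazy (module, class) pair and import it on demand.
        module_name, class_name = _VLLM_RENDERERS[name]
        module = importlib.import_module(f"vllm.renderers.{module_name}")
        return getattr(module, class_name)

    # resolve_renderer("qwen_vl") -> QwenVLRenderer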

View File

@@ -7,9 +7,9 @@ from transformers import AutoTokenizer
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
-from . import TokenizerLike
 from .deepseek_v32_encoding import encode_messages
 from .hf import HfTokenizer, get_cached_tokenizer
+from .protocol import TokenizerLike
 
 
 def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:

View File

@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import copy
+import unicodedata
+from collections.abc import Collection, Set
+
+from transformers import AutoTokenizer
+
+from .hf import HfTokenizer, get_cached_tokenizer
+from .protocol import TokenizerLike
+
+
+def get_qwen_vl_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
+    """
+    The logic of adding image pad tokens should only be applied in
+    `QwenVLProcessor`, so they are patched out here.
+
+    The definition of the wrapped tokenizer can be found here:
+    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
+    """
+    new_tokenizer = copy.copy(tokenizer)
+
+    class TokenizerWithoutImagePad(tokenizer.__class__):  # type: ignore
+        def tokenize(
+            self,
+            text: str,
+            allowed_special: Set[str] | str = "all",
+            disallowed_special: Collection[str] | str = (),
+            **kwargs,
+        ) -> list[bytes | str]:
+            text = unicodedata.normalize("NFC", text)
+
+            return [
+                self.decoder[t]
+                for t in self.tokenizer.encode(
+                    text,
+                    allowed_special=allowed_special,
+                    disallowed_special=disallowed_special,
+                )
+            ]
+
+        def _decode(
+            self,
+            token_ids: int | list[int],
+            skip_special_tokens: bool = False,
+            errors: str | None = None,
+            **kwargs,
+        ) -> str:
+            if isinstance(token_ids, int):
+                token_ids = [token_ids]
+
+            return self.tokenizer.decode(
+                token_ids,
+                errors=errors or self.errors,
+            )
+
+    TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
+
+    new_tokenizer.__class__ = TokenizerWithoutImagePad
+    return new_tokenizer
+
+
+class QwenVLTokenizer(TokenizerLike):
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
+        tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
+        return get_cached_tokenizer(get_qwen_vl_tokenizer(tokenizer))
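
The wrapper swaps the tokenizer's class for a subclass whose tokenize and _decode bypass the image-pad insertion, leaving plain-text round-trips untouched by the multimodal padding logic. A usage sketch (assumes the model can be downloaded and trust_remote_code is acceptable):

    from vllm.tokenizers.qwen_vl import QwenVLTokenizer

    tok = QwenVLTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)

    # The patched class advertises itself via its name, as the new test asserts.
    assert "WithoutImagePad" in tok.__class__.__name__
    print(tok.decode(tok.encode("Hello, world!")))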

View File

@@ -36,6 +36,7 @@ _VLLM_TOKENIZERS = {
"grok2": ("grok2", "Grok2Tokenizer"), "grok2": ("grok2", "Grok2Tokenizer"),
"hf": ("hf", "CachedHfTokenizer"), "hf": ("hf", "CachedHfTokenizer"),
"mistral": ("mistral", "MistralTokenizer"), "mistral": ("mistral", "MistralTokenizer"),
"qwen_vl": ("qwen_vl", "QwenVLTokenizer"),
} }
@@ -165,6 +166,10 @@ def resolve_tokenizer_args(
): ):
tokenizer_mode = "grok2" tokenizer_mode = "grok2"
# Model-specific tokenizers
if tokenizer_mode == "auto" and "/Qwen-VL" in str(tokenizer_name):
tokenizer_mode = "qwen_vl"
# Fallback to HF tokenizer # Fallback to HF tokenizer
if tokenizer_mode == "auto": if tokenizer_mode == "auto":
tokenizer_mode = "hf" tokenizer_mode = "hf"