[ROCm][CI] Pin TorchCodec to v0.10.0 for ROCm compatibility (#34447 )

Signed-off-by: Andreas Karatzas <akaratza@amd.com> (cherry picked from commit 4c078fa546) (cherry picked from commit a976961fb77d38129abf69edd4952101731f2421)
[ci] Use the right tag for CPU arm64 image (#34915 )
2026-02-24 20:30:22 -08:00 · 2026-02-24 20:30:13 -08:00 · 2026-02-24 20:30:02 -08:00 · 2026-02-24 20:28:53 -08:00 · 2026-02-24 20:28:45 -08:00 · 2026-02-12 02:37:52 -08:00
58 changed files with 432 additions and 236 deletions
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -3,7 +3,6 @@ steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
-    timeout_in_minutes: 600
    commands:
    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
@@ -42,7 +41,7 @@ steps:
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
-
+  
  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -14,7 +14,7 @@ BUILDKITE_COMMIT=$3
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

 # skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,10 +24,10 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
+  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
  --target vllm-test \
  --progress plain .

 # push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
+docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -248,8 +248,8 @@ steps:
          DOCKER_BUILDKIT: "1"
          DOCKERHUB_USERNAME: "vllmbot"

-  - group: "Publish wheels"
-    key: "publish-wheels"
+  - group: "Publish release artifacts"
+    key: "publish-release-artifacts"
    steps:
      - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
        key: block-upload-release-wheels
@@ -265,6 +265,27 @@ steps:
          queue: small_cpu_queue_postmerge
        commands:
          - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
+      
+      - block: "Confirm update release images to DockerHub"
+        key: block-update-release-images-dockerhub
+        depends_on:
+          - input-release-version
+          - annotate-release-workflow
+      
+      - label: "Publish release images to DockerHub"
+        depends_on:
+          - block-update-release-images-dockerhub
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "bash .buildkite/scripts/push-release-images-dockerhub.sh"
+        plugins:
+          - docker-login#v3.0.0:
+              username: vllmbot
+              password-env: DOCKERHUB_TOKEN
+        env:
+          DOCKER_BUILDKIT: "1"
+          DOCKERHUB_USERNAME: "vllmbot"

  # =============================================================================
  # ROCm Release Pipeline (x86_64 only)
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -39,6 +39,7 @@ docker run \
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
--- a/.buildkite/scripts/push-release-images-dockerhub.sh
+++ b/.buildkite/scripts/push-release-images-dockerhub.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+set -ex
+
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
+if [ -z "${RELEASE_VERSION}" ]; then
+    echo "RELEASE_VERSION is not set"
+    exit 1
+fi
+
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+
+# Download images:
+
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
+docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
+
+# Tag and push images:
+
+## CUDA
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+docker push vllm/vllm-openai:latest-x86_64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker push vllm/vllm-openai:latest-aarch64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+
+## ROCm
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+docker push vllm/vllm-openai-rocm:latest-base
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+
+## CPU
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
+docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
+docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai-cpu:latest-x86_64
+docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
+docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
+docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+docker push vllm/vllm-openai-cpu:latest-arm64
+docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+
+# Create multi-arch manifest:
+
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
+
+docker manifest rm vllm/vllm-openai:latest-cu130
+docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker manifest push vllm/vllm-openai:latest-cu130
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
+
+docker manifest rm vllm/vllm-openai-cpu:latest || true
+docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
+docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+docker manifest push vllm/vllm-openai-cpu:latest
+docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")

 #
 # Try to find python package with an executable that exactly matches
@@ -433,7 +433,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
    endif()

-    if (MARLIN_SM75_ARCHS)
+    if (MARLIN_SM75_ARCHS) 
      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
@@ -445,7 +445,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
    endif()

-    if (MARLIN_FP8_ARCHS)
+    if (MARLIN_FP8_ARCHS) 
      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
@@ -1042,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
    endif()

-    if (MARLIN_MOE_SM75_ARCHS)
+    if (MARLIN_MOE_SM75_ARCHS) 
      file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_MOE_SM75_SRC}"
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -686,6 +686,7 @@ def get_model_params(config):
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
+        "GlmMoeDsaForCausalLM",
        "Glm4MoeForCausalLM",
        "Glm4MoeLiteForCausalLM",
        "NemotronHForCausalLM",
--- a/cmake/external_projects/triton_kernels.cmake
+++ b/cmake/external_projects/triton_kernels.cmake
@@ -1,9 +1,9 @@
 # Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels

-set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
+set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")

 # Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
-# be directly set to the triton_kernels python directory.
+# be directly set to the triton_kernels python directory. 
 if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
  message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
  FetchContent_Declare(
@@ -24,7 +24,7 @@ else()
  )
 endif()

-# Fetch content
+# Fetch content 
 FetchContent_MakeAvailable(triton_kernels)

 if (NOT triton_kernels_SOURCE_DIR)
@@ -47,7 +47,7 @@ install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/tr
 ## Copy .py files to install directory.
 install(DIRECTORY
        ${TRITON_KERNELS_PYTHON_DIR}
-        DESTINATION
+        DESTINATION 
        vllm/third_party/triton_kernels/
        COMPONENT triton_kernels
        FILES_MATCHING PATTERN "*.py")
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -134,7 +134,6 @@ WORKDIR /vllm-workspace
 # Copy test requirements
 COPY requirements/test.in requirements/cpu-test.in

-# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 RUN \
    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
    remove_packages_not_supported_on_aarch64() { \
--- a/docs/getting_started/installation/gpu.xpu.inc.md
+++ b/docs/getting_started/installation/gpu.xpu.inc.md
@@ -6,10 +6,11 @@ vLLM initially supports basic model inference and serving on Intel GPU platform.
 # --8<-- [start:requirements]

 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
- OneAPI requirements: oneAPI 2025.1
+- OneAPI requirements: oneAPI 2025.3
+- Dependency: [vllm-xpu-kernels](https://github.com/vllm-project/vllm-xpu-kernels): a package provide all necessary vllm custom kernel when running vLLM on Intel GPU platform, 
 - Python: 3.12
 !!! warning
-    The provided IPEX whl is Python3.12 specific so this version is a MUST.
+    The provided vllm-xpu-kernels whl is Python3.12 specific so this version is a MUST.

 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@@ -24,7 +25,7 @@ Currently, there are no pre-built XPU wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]

- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
+- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.3 or later.
 - Second, install Python packages for vLLM XPU backend building:

 ```bash
@@ -37,7 +38,7 @@ pip install -v -r requirements/xpu.txt
 - Then, build and install vLLM XPU backend:

 ```bash
-VLLM_TARGET_DEVICE=xpu python setup.py install
+VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . -v
 ```

 # --8<-- [end:build-wheel-from-source]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
    "packaging>=24.2",
    "setuptools>=77.0.3,<81.0.0",
    "setuptools-scm>=8.0",
-    "torch == 2.10.0",
+    "torch == 2.9.1",
    "wheel",
    "jinja2",
    "grpcio-tools==1.78.0",
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,10 +4,10 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.10.0
+torch==2.9.1
 wheel
 jinja2>=3.1.6
 regex
 build
-protobuf
+protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.*
 grpcio-tools==1.78.0 # Required for grpc entrypoints
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -9,7 +9,7 @@ blake3
 py-cpuinfo
 transformers >= 4.56.0, < 5
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
-protobuf # Required by LlamaTokenizer, gRPC.
+protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp >= 3.13.3
 openai >= 1.99.1  # For Responses API with reasoning content
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding

 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0
-torch==2.10.0
-torchaudio==2.10.0
+torch==2.9.1
+torchaudio==2.9.1
 # These must be updated alongside torch
-torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.6.3
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -43,5 +43,5 @@ tritonclient>=2.51.0
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
 runai-model-streamer[s3,gcs]==0.15.3
-fastsafetensors>=0.1.10
+fastsafetensors>=0.2.2
 pydantic>=2.12 # 2.11 leads to error on python 3.13
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -1,11 +1,12 @@
 # Common dependencies
 -r common.txt

--extra-index-url https://download.pytorch.org/whl/test/rocm7.0
-torch==2.10.0
-torchvision==0.25.0
-torchaudio==2.10.0
-triton==3.6.0
+--extra-index-url https://download.pytorch.org/whl/rocm6.4
+torch==2.9.1
+torchvision==0.24.1
+torchaudio==2.9.1
+
+triton==3.5.1
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -1,6 +1,11 @@
 # Common dependencies
 -r common.txt

+# The version of gRPC libraries should be consistent with each other
+grpcio==1.78.0
+grpcio-reflection==1.78.0
+grpcio-tools==1.78.0
+
 numba == 0.61.2 # Required for N-gram speculative decoding

 # Dependencies for AMD GPUs
@@ -14,5 +19,4 @@ setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
 runai-model-streamer[s3,gcs]==0.15.3
 conch-triton-kernels==1.2.1
-timm>=1.0.17
-grpcio-tools==1.78.0 # Should match `build.txt`
+timm>=1.0.17
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -24,10 +24,10 @@ sentence-transformers>=5.2.0 # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
-timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.10.0
-torchaudio==2.10.0
-torchvision==0.25.0
+timm==1.0.17 # required for internvl and gemma3n-mm test
+torch==2.9.1
+torchaudio==2.9.1
+torchvision==0.24.1
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.9.0 # required for voxtral test
@@ -48,12 +48,16 @@ buildkite-test-collector==0.1.9
 genai_perf>=0.0.8
 tritonclient>=2.51.0

-grpcio-tools==1.78.0 # Should match `build.txt`
+# The version of gRPC libraries should be consistent with each other
+grpcio==1.78.0
+grpcio-reflection==1.78.0
+grpcio-tools==1.78.0
+
 arctic-inference == 0.1.1 # Required for suffix decoding test
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
 runai-model-streamer[s3,gcs]==0.15.3
-fastsafetensors>=0.1.10
+fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
 pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
 terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -155,10 +155,6 @@ coverage==7.10.6
    # via pytest-cov
 cramjam==2.9.0
    # via fastparquet
-cuda-bindings==12.9.4
-    # via torch
-cuda-pathfinder==1.3.3
-    # via cuda-bindings
 cupy-cuda12x==13.6.0
    # via ray
 cycler==0.12.1
@@ -224,7 +220,7 @@ fastparquet==2024.11.0
    # via genai-perf
 fastrlock==0.8.2
    # via cupy-cuda12x
-fastsafetensors==0.1.10
+fastsafetensors==0.2.2
    # via -r requirements/test.in
 filelock==3.16.1
    # via
@@ -309,8 +305,13 @@ greenlet==3.2.3
    # via sqlalchemy
 grpcio==1.78.0
    # via
+    #   -r requirements/test.in
+    #   grpcio-reflection
    #   grpcio-tools
    #   ray
+    #   tensorboard
+grpcio-reflection==1.78.0
+    # via -r requirements/test.in
 grpcio-tools==1.78.0
    # via -r requirements/test.in
 gunicorn==23.0.0
@@ -635,7 +636,7 @@ nvidia-nvjitlink-cu12==12.9.86
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
-nvidia-nvshmem-cu12==3.4.5
+nvidia-nvshmem-cu12==3.3.20
    # via torch
 nvidia-nvtx-cu12==12.9.79
    # via torch
@@ -785,6 +786,7 @@ protobuf==6.33.2
    # via
    #   google-api-core
    #   googleapis-common-protos
+    #   grpcio-reflection
    #   grpcio-tools
    #   mlflow-skinny
    #   opentelemetry-proto
@@ -1167,14 +1169,13 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.10.0+cu129
+torch==2.9.1+cu129
    # via
    #   -r requirements/test.in
    #   accelerate
    #   bitsandbytes
    #   efficientnet-pytorch
    #   encodec
-    #   fastsafetensors
    #   kornia
    #   lightly
    #   lightning
@@ -1196,7 +1197,7 @@ torch==2.10.0+cu129
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.10.0+cu129
+torchaudio==2.9.1+cu129
    # via
    #   -r requirements/test.in
    #   encodec
@@ -1209,7 +1210,7 @@ torchmetrics==1.7.4
    #   pytorch-lightning
    #   terratorch
    #   torchgeo
-torchvision==0.25.0+cu129
+torchvision==0.24.1+cu129
    # via
    #   -r requirements/test.in
    #   lightly
@@ -1251,7 +1252,7 @@ transformers==4.57.5
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.6.0
+triton==3.5.1
    # via torch
 tritonclient==2.64.0
    # via -r requirements/test.in
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -15,4 +15,4 @@ torch==2.10.0+xpu
 torchaudio
 torchvision

-vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.1/vllm_xpu_kernels-0.1.1-cp312-cp312-linux_x86_64.whl
+vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl
--- a/setup.py
+++ b/setup.py
@@ -1035,7 +1035,7 @@ setup(
    extras_require={
        "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
        "tensorizer": ["tensorizer==2.10.1"],
-        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
+        "fastsafetensors": ["fastsafetensors >= 0.2.2"],
        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
        "audio": [
            "librosa",
--- a/tests/compile/passes/test_fusion_attn.py
+++ b/tests/compile/passes/test_fusion_attn.py
@@ -267,7 +267,7 @@ elif current_platform.is_rocm():
    PATTERN_TEST_MODELS_FP8 = [
        ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel)
    ]
-    BACKENDS = [
+    BACKENDS_FP8 = [
        AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
        AttentionBackendEnum.ROCM_ATTN,
        AttentionBackendEnum.TRITON_ATTN,
@@ -474,6 +474,17 @@ def test_attention_quant_pattern(
    assert attn_nodes_pre[0].kwargs.get("output_block_scale") is None, (
        "Attention should not have output_block_scale before fusion"
    )
+
+    kv_cache_dummy_dep_pre_is_none = (
+        attn_nodes_pre[0].kwargs.get("kv_cache_dummy_dep") is None
+    )
+    kv_cache_dummy_dep_post_is_none = (
+        attn_nodes_post[0].kwargs.get("kv_cache_dummy_dep") is None
+    )
+    assert not (kv_cache_dummy_dep_pre_is_none ^ kv_cache_dummy_dep_post_is_none), (
+        "The kv_cache_dummy_dep should be consistent before and after fusion"
+    )
+
    if quant_key.dtype == FP8_DTYPE:
        assert attn_nodes_post[0].kwargs.get("output_block_scale") is None, (
            "Attention should not have output_block_scale after FP8 fusion"
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -90,7 +90,9 @@ def use_vllm_config(vllm_config: VllmConfig):
        yield


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        vllm_config = make_vllm_config()
@@ -114,7 +116,9 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
            assert torch.allclose(actual, expected)


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
    with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
        args = (torch.randn(10, 10),)
@@ -128,7 +132,9 @@ def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
            CompiledMod(vllm_config=vllm_config)(*args)


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        args = (torch.randn(10, 10),)
@@ -156,7 +162,9 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
            assert torch.allclose(ret, expected)


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
    """
    Test that cache loading correctly handles the returns_tuple logic.
@@ -215,7 +223,9 @@ def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
            )


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 def test_cache_load_returns_tuple_consistency_tuple_output(
    monkeypatch: pytest.MonkeyPatch,
 ):
@@ -284,7 +294,9 @@ def test_cache_load_returns_tuple_consistency_tuple_output(
            )


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 def test_shape_env(monkeypatch: pytest.MonkeyPatch):
    """
    Test that the shape environment is correctly serialized and preserved
@@ -321,7 +333,9 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
                assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 def test_partition_wrapper_applied_on_aot_load(
    monkeypatch: pytest.MonkeyPatch, vllm_tmp_cache: Path, mocker
 ):
@@ -412,7 +426,9 @@ def test_partition_wrapper_applied_on_aot_load(
        )


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
@create_new_process_for_each_test("spawn")
 def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
    """
@@ -476,7 +492,9 @@ def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
        symbolic_shapes_module.make_symbol = original_make_symbol


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 class TestStandaloneCompiledArtifacts:
    def test_init(self):
        cache = StandaloneCompiledArtifacts()
@@ -650,7 +668,9 @@ class TestStandaloneCompiledArtifacts:
        assert len(restored_cache.loaded_submodule_store) == 0


-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 class TestStandaloneCompiledArtifactsIntegration:
    def test_add_pickle_unpickle(self):
        cache = StandaloneCompiledArtifacts()
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -39,7 +39,9 @@ def get_test_models():
@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.parametrize("evaluate_guards", [False, True])
-@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
 def test_dynamic_shapes_compilation(
    monkeypatch,
    model_name,
--- a/tests/entrypoints/openai/test_realtime_validation.py
+++ b/tests/entrypoints/openai/test_realtime_validation.py
@@ -129,5 +129,5 @@ async def test_multi_chunk_streaming(
                " First words I spoke in the original phonograph."
                " A little piece of practical poetry. Mary had a little lamb,"
                " it sleeps with quite a flow, and everywhere that Mary went,"
-                " the lamb was sure to go"
+                " the lamb was sure to go."
            )
--- a/tests/kernels/moe/test_shared_fused_moe_routed_transform.py
+++ b/tests/kernels/moe/test_shared_fused_moe_routed_transform.py
@@ -14,7 +14,6 @@ import torch.nn as nn
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
-from vllm.utils.torch_utils import is_torch_equal_or_newer


 class SimpleLinear(nn.Module):
@@ -61,10 +60,6 @@ def setup_cuda():
@pytest.mark.parametrize("num_tokens", [1, 32])
@pytest.mark.parametrize("hidden_size,latent_size", [(256, 128), (128, 64)])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.skipif(
-    is_torch_equal_or_newer("2.10.0"),
-    reason="Test fails with PyTorch 2.10.0 see: https://github.com/vllm-project/vllm/issues/33995",
-)
 def test_routed_input_transform_inside_vs_outside(
    num_tokens: int,
    hidden_size: int,
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -275,6 +275,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        "zai-org/GLM-4.7-Flash",
        min_transformers_version="5.0.0",
    ),
+    "GlmMoeDsaForCausalLM": _HfExamplesInfo(
+        "zai-org/GLM-5", min_transformers_version="5.0.1", is_available_online=False
+    ),
    "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
    "GPTBigCodeForCausalLM": _HfExamplesInfo(
        "bigcode/starcoder",
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -97,7 +97,7 @@ def can_initialize(
            "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
        )

-    if model_arch == "DeepseekV32ForCausalLM":
+    if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]:
        from vllm.platforms import current_platform

        capability = current_platform.get_device_capability()
--- a/tests/quantization/test_cpu_wna16.py
+++ b/tests/quantization/test_cpu_wna16.py
@@ -17,7 +17,7 @@ DTYPE = ["bfloat16"]

@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
-def test_ipex_quant(vllm_runner, model, dtype):
+def test_cpu_quant(vllm_runner, model, dtype):
    with vllm_runner(model, dtype=dtype) as llm:
        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
    assert output
--- a/tests/quantization/test_ipex_quant.py
+++ b/tests/quantization/test_ipex_quant.py
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Test model set-up and inference for quantized HF models supported
-on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
-
-Validating the configuration and printing results for manual checking.
-
-Run `pytest tests/quantization/test_ipex_quant.py`.
-"""
-
-import pytest
-
-from vllm.platforms import current_platform
-
-MODELS = [
-    "AMead10/Llama-3.2-1B-Instruct-AWQ",
-    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",  # with g_idx
-]
-DTYPE = ["bfloat16"]
-
-
-@pytest.mark.skipif(
-    not current_platform.is_cpu() and not current_platform.is_xpu(),
-    reason="only supports Intel CPU/XPU backend.",
-)
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", DTYPE)
-def test_ipex_quant(vllm_runner, model, dtype):
-    with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
-    assert output
-    print(output)
--- a/tests/v1/core/test_async_scheduler.py
+++ b/tests/v1/core/test_async_scheduler.py
@@ -236,7 +236,7 @@ def test_prefix_caching_for_multi_turn():
        req._all_token_ids = req.prompt_token_ids.copy()
        req.all_token_ids = ConstantList(req._all_token_ids)
        req.block_hashes = []
-        req.block_hashes = req.get_hash_new_full_blocks()
+        req.update_block_hashes()

    # Schedule the next-turn requests.
    for req in next_turn_requests:
--- a/tools/install_torchcodec_rocm.sh
+++ b/tools/install_torchcodec_rocm.sh
@@ -7,7 +7,8 @@
 set -e

 TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
-TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-main}"
+# Pin to a specific release for reproducibility; update as needed.
+TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}"

 echo "=== TorchCodec Installation Script ==="

--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -53,7 +53,7 @@ if hasattr(torch.ops._xpu_C, "int4_gemm_w4a16"):
        return torch.empty((M, N), dtype=input.dtype, device=input.device)


-class ipex_ops:
+class xpu_ops:
    @staticmethod
    def flash_attn_varlen_func(
        q: torch.Tensor,
@@ -73,7 +73,7 @@ class ipex_ops:
        cu_seqlens_k: torch.Tensor | None = None,
        # passed in qwen vl
        dropout_p: float = 0.0,
-        # The following parameters are not used in ipex kernel currently,
+        # The following parameters are not used in xpu kernel currently,
        # we keep API compatible to CUDA's.
        scheduler_metadata=None,
        fa_version: int = 2,
@@ -153,6 +153,6 @@ class ipex_ops:
        sm_margin=0,  # Can be tuned if some SMs are used for communication
    ) -> None:
        logger.warning_once(
-            "get_scheduler_metadata is not implemented for ipex_ops, returning None."
+            "get_scheduler_metadata is not implemented for xpu_ops, returning None."
        )
        return None
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -233,7 +233,7 @@ class InductorStandaloneAdaptor(CompilerInterface):

        from torch._inductor import standalone_compile

-        supports_aot = is_torch_equal_or_newer("2.10.0")
+        supports_aot = is_torch_equal_or_newer("2.10.0.dev")

        if not supports_aot and envs.VLLM_USE_MEGA_AOT_ARTIFACT:
            logger.error(
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -333,7 +333,7 @@ def _support_torch_compile(
    ) -> None:
        def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None:
            if ds_type == DynamicShapesType.UNBACKED:
-                if is_torch_equal_or_newer("2.10.0"):
+                if is_torch_equal_or_newer("2.10.0.dev"):
                    for dim in dims:
                        torch._dynamo.decorators.mark_unbacked(
                            arg, dim, hint_override=arg.size()[dim]
@@ -373,7 +373,7 @@ def _support_torch_compile(
                    if isinstance(arg, torch.Tensor):
                        # In case dims is specified with negative indexing
                        dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
-                        if is_torch_equal_or_newer("2.10.0"):
+                        if is_torch_equal_or_newer("2.10.0.dev"):
                            for dim in dims:
                                torch._dynamo.decorators.mark_unbacked(
                                    arg, dim, hint_override=arg.size()[dim]
@@ -525,9 +525,9 @@ def _support_torch_compile(
            fx_config_patches["backed_size_oblivious"] = True

        # Prepare inductor config patches
-        # assume_32bit_indexing is only available in torch 2.10.0+
+        # assume_32bit_indexing is only available in torch 2.10.0.dev+
        inductor_config_patches = {}
-        if is_torch_equal_or_newer("2.10.0"):
+        if is_torch_equal_or_newer("2.10.0.dev"):
            inductor_config_patches["assume_32bit_indexing"] = (
                self.compilation_config.dynamic_shapes_config.assume_32_bit_indexing
            )
--- a/vllm/compilation/passes/fusion/attn_quant_fusion.py
+++ b/vllm/compilation/passes/fusion/attn_quant_fusion.py
@@ -142,6 +142,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
            v: torch.Tensor,
            output_attn: torch.Tensor,
            scale: torch.Tensor,
+            kv_cache_dummy_dep: torch.Tensor,
        ) -> torch.Tensor:
            at1 = auto_functionalized(
                ATTN_OP,
@@ -152,6 +153,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
                layer_name=self.layer_name,
                output_scale=None,
                output_block_scale=None,
+                kv_cache_dummy_dep=kv_cache_dummy_dep,
            )
            attn_out_view = RESHAPE_OP(
                at1[1], [q.shape[0], self.num_heads * self.head_size]
@@ -165,6 +167,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
            v: torch.Tensor,
            output_attn: torch.Tensor,
            scale: torch.Tensor,
+            kv_cache_dummy_dep: torch.Tensor,
        ) -> torch.Tensor:
            # attn output in quant_dtype
            output_attn = torch.ops.aten.full.default(
@@ -182,6 +185,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
                layer_name=self.layer_name,
                output_scale=scale,
                output_block_scale=None,
+                kv_cache_dummy_dep=kv_cache_dummy_dep,
            )
            return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size])

@@ -191,6 +195,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
            self.empty(5, self.num_heads, self.head_size),  # v
            self.empty(5, self.num_heads, self.head_size),  # attn_output
            empty_fp32(1, 1),  # scale
+            self.empty(0),  # kv_cache_dummy_dep
        ]

        pm.register_replacement(
@@ -228,6 +233,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
            output_quant: torch.Tensor,
            output_scale: torch.Tensor,
            input_scale: torch.Tensor,
+            kv_cache_dummy_dep: torch.Tensor,
        ) -> tuple[torch.Tensor, torch.Tensor]:
            at1 = auto_functionalized(
                ATTN_OP,
@@ -238,6 +244,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
                layer_name=self.layer_name,
                output_scale=None,
                output_block_scale=None,
+                kv_cache_dummy_dep=kv_cache_dummy_dep,
            )
            attn_out_view = RESHAPE_OP(
                at1[1], [q.shape[0], self.num_heads * self.head_size]
@@ -261,6 +268,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
            output_quant: torch.Tensor,
            output_scale: torch.Tensor,
            input_scale: torch.Tensor,
+            kv_cache_dummy_dep: torch.Tensor,
        ) -> tuple[torch.Tensor, torch.Tensor]:
            # attention output in quant_dtype
            output_attn = torch.ops.aten.full.default(
@@ -280,6 +288,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
                layer_name=self.layer_name,
                output_scale=input_scale,
                output_block_scale=output_scale_view,
+                kv_cache_dummy_dep=kv_cache_dummy_dep,
            )
            output = RESHAPE_OP(at2[1], [-1, self.num_heads * self.head_size // 2])
            return output, at2[2]
@@ -294,6 +303,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
                128, round_up(self.num_heads * self.head_size // 16, 4)
            ),  # output_scale
            empty_fp32(1, 1),  # input_scale
+            self.empty(0),  # kv_cache_dummy_dep
        ]

        pm.register_replacement(
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -181,7 +181,7 @@ class SpeculativeConfig:
    @staticmethod
    def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
        initial_architecture = hf_config.architectures[0]
-        if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
+        if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"):
            hf_config.model_type = "deepseek_mtp"
        if hf_config.model_type == "deepseek_mtp":
            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
--- a/vllm/entrypoints/openai/realtime/connection.py
+++ b/vllm/entrypoints/openai/realtime/connection.py
@@ -48,7 +48,6 @@ class RealtimeConnection:
        self.generation_task: asyncio.Task | None = None

        self._is_connected = False
-        self._is_input_finished = False
        self._is_model_validated = False

        self._max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
@@ -145,7 +144,7 @@ class RealtimeConnection:
            commit_event = InputAudioBufferCommit(**event)
            # final signals that the audio is finished
            if commit_event.final:
-                self._is_input_finished = True
+                self.audio_queue.put_nowait(None)
            else:
                await self.start_generation()
        else:
@@ -239,11 +238,6 @@ class RealtimeConnection:
                    # finish because websocket connection was killed
                    break

-                if self.audio_queue.empty() and self._is_input_finished:
-                    # finish because client signals that audio input
-                    # is finished
-                    break
-
            usage = UsageInfo(
                prompt_tokens=prompt_token_ids_len,
                completion_tokens=completion_tokens_len,
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -271,7 +271,7 @@ def use_aot_compile() -> bool:

    default_value = (
        "1"
-        if is_torch_equal_or_newer("2.11.0.dev") and not disable_compile_cache()
+        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
        else "0"
    )

--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -974,7 +974,7 @@ def enable_batch_invariant_mode():
    )

    reduced_precision_val = (
-        (False, False) if is_torch_equal_or_newer("2.10.0") else False
+        (False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
    )
    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
        reduced_precision_val
--- a/vllm/model_executor/layers/fused_moe/init.py
+++ b/vllm/model_executor/layers/fused_moe/init.py
@@ -102,6 +102,7 @@ if HAS_TRITON:
    )
    from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
        XPUExperts,
+        XPUExpertsFp8,
    )

    __all__ += [
@@ -121,6 +122,7 @@ if HAS_TRITON:
        "BatchedDeepGemmExperts",
        "TritonOrDeepGemmExperts",
        "XPUExperts",
+        "XPUExpertsFp8",
    ]
 else:
    # Some model classes directly use the custom ops. Add placeholders
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -27,21 +27,9 @@ logger = init_logger(__name__)
 if has_triton_kernels():
    try:
        import triton_kernels.swiglu
-        from triton_kernels.matmul_ogs import (
-            FnSpecs,
-            FusedActivation,
-            GatherIndx,
-            RoutingData,
-            ScatterIndx,
-            matmul_ogs,
-        )
-        from triton_kernels.tensor import (
-            BIT,
-            Bitmatrix,
-            SparseMatrix,
-            make_ragged_tensor_metadata,
-        )
-        from triton_kernels.topk import topk
+        from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
+        from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix
+        from triton_kernels.tensor import Bitmatrix
    except (AttributeError, ImportError) as e:
        logger.error(
            "Failed to import Triton kernels. Please make sure your triton "
@@ -90,58 +78,6 @@ def pack_bitmatrix(
        tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows)


-def legacy_routing_from_bitmatrix(
-    bitmatrix: "Bitmatrix",
-    expt_scal: torch.Tensor,
-    expt_indx: torch.Tensor,
-    n_expts_tot: int,
-    n_expts_act: int,
-) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
-    """
-    Replacement for the removed triton_kernels.routing.routing_from_bitmatrix.
-    Creates routing data from a bitmatrix representation.
-    """
-    sparse_logits = SparseMatrix(indx=expt_indx, vals=expt_scal, mask=bitmatrix)
-    dispatch_indx = sparse_logits.mask_metadata.row_sorted_indx
-    combine_indx = sparse_logits.mask_metadata.col_sorted_indx
-    ragged_batch_metadata = make_ragged_tensor_metadata(
-        sparse_logits.mask_metadata.col_sum,
-        dispatch_indx.shape[0],
-    )
-    gate_scal = sparse_logits.vals.flatten()[combine_indx]
-    routing_data = RoutingData(
-        gate_scal,
-        ragged_batch_metadata.block_sizes,
-        n_expts_tot,
-        n_expts_act,
-        ragged_batch_metadata,
-    )
-    gather_idx = GatherIndx(combine_indx, dispatch_indx)
-    scatter_idx = ScatterIndx(dispatch_indx, combine_indx)
-    return routing_data, gather_idx, scatter_idx
-
-
-def legacy_routing(
-    logits: torch.Tensor,
-    n_expts_act: int,
-    sm_first: bool = False,
-) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
-    """
-    Replacement for the removed triton_kernels.routing.routing function.
-    Computes routing data from gating logits.
-    """
-    if sm_first:
-        logits = torch.softmax(logits, dim=-1)
-    sparse_logits = topk(logits, n_expts_act, apply_softmax=not sm_first)
-    return legacy_routing_from_bitmatrix(
-        sparse_logits.mask,
-        sparse_logits.vals,
-        sparse_logits.indx,
-        logits.shape[-1],
-        n_expts_act,
-    )
-
-
 def triton_kernel_moe_forward(
    hidden_states: torch.Tensor,
    w1,  # Tensor or triton_kernels.Tensor
@@ -155,7 +91,7 @@ def triton_kernel_moe_forward(
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    routing_data, gather_idx, scatter_idx = legacy_routing(
+    routing_data, gather_idx, scatter_idx = routing(
        gating_output, topk, sm_first=not renormalize
    )

@@ -232,10 +168,9 @@ def triton_kernel_fused_experts(
    output_tensor = _resize_cache(output_tensor, (batch_dim, M, K))

    act = FusedActivation(
-        FnSpecs(
-            "swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit"), reduction_n=2
-        ),
+        FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
        (swiglu_alpha, swiglu_limit),
+        2,
    )
    gammas = routing_data.gate_scal if routing_data else None

@@ -297,12 +232,12 @@ def make_routing_data(
    bitmatrix_shape = [n_rows, bm_cols * 32]
    bitmatrix_shape_max = [n_rows, None]
    bitmatrix = Bitmatrix(
-        bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max
+        bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None
    )

    # matmul_ogs expects invalid topk_weights to be -1s
    topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
-    routing_data, gather_indx, scatter_indx = legacy_routing_from_bitmatrix(
+    routing_data, gather_indx, scatter_indx = routing_from_bitmatrix(
        bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
    )

--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -52,6 +52,7 @@ class Fp8MoeBackend(Enum):
    AITER = "AITER"
    VLLM_CUTLASS = "VLLM_CUTLASS"
    BATCHED_VLLM_CUTLASS = "BATCHED_VLLM_CUTLASS"
+    XPU = "XPU"


 def backend_to_kernel_cls(
@@ -123,6 +124,13 @@ def backend_to_kernel_cls(

        return CutlassBatchedExpertsFp8

+    elif backend == Fp8MoeBackend.XPU:
+        from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
+            XPUExpertsFp8,
+        )
+
+        return XPUExpertsFp8
+
    else:
        raise ValueError(f"Unknown FP8 MoE backend: {backend.value}")

@@ -154,6 +162,7 @@ def select_fp8_moe_backend(
        Fp8MoeBackend.TRITON,
        Fp8MoeBackend.BATCHED_TRITON,
        Fp8MoeBackend.MARLIN,
+        Fp8MoeBackend.XPU,
    ]

    # NOTE(rob): We need to peak into the P/F selection to determine
@@ -393,6 +402,7 @@ def convert_to_fp8_moe_kernel_format(
            Fp8MoeBackend.BATCHED_TRITON,
            Fp8MoeBackend.VLLM_CUTLASS,
            Fp8MoeBackend.BATCHED_VLLM_CUTLASS,
+            Fp8MoeBackend.XPU,
        ]:
            raise ValueError(f"Unsupported FP8 MoE backend: {fp8_backend.value}")

--- a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
@@ -4,13 +4,16 @@ import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
    FusedMoEParallelConfig,
+    FusedMoEQuantConfig,
 )
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceNoOP,
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    QuantKey,
+    kFp8DynamicTensorSym,
    kFp8StaticTensorSym,
 )
 from vllm.platforms import current_platform
@@ -20,6 +23,21 @@ if current_platform.is_xpu():


 class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
+    def __init__(
+        self,
+        moe_config: FusedMoEConfig,
+        quant_config: FusedMoEQuantConfig,
+        max_num_tokens: int | None = None,
+        num_dispatchers: int | None = None,
+    ):
+        super().__init__(
+            moe_config,
+            quant_config,
+            max_num_tokens,
+            num_dispatchers,
+        )
+        self.is_fp8 = False
+
    @property
    def expects_unquantized_inputs(self) -> bool:
        return True
@@ -49,10 +67,10 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
-        # TODO: dispatch based on device.
        SUPPORTED_W_A = [
            (None, None),
            (kFp8StaticTensorSym, None),
+            (kFp8StaticTensorSym, kFp8DynamicTensorSym),
        ]
        return (weight_key, activation_key) in SUPPORTED_W_A

@@ -103,10 +121,10 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
        xpu_fused_moe(
            hidden_states=hidden_states,
            w13=w1,
-            w13_scales=a1q_scale,
+            w13_scales=self.w1_scale,
            w13_bias=self.w1_bias,
            w2=w2,
-            w2_scales=a2_scale,
+            w2_scales=self.w2_scale,
            w2_bias=self.w2_bias,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
@@ -116,5 +134,22 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
            ep_rank=self.moe_config.ep_rank,
            ep_size=self.moe_config.ep_size,
            output=output,
+            is_fp8=self.is_fp8,
        )
-        return
+
+
+class XPUExpertsFp8(XPUExperts):
+    def __init__(
+        self,
+        moe_config: FusedMoEConfig,
+        quant_config: FusedMoEQuantConfig,
+        max_num_tokens: int | None = None,
+        num_dispatchers: int | None = None,
+    ):
+        super().__init__(
+            moe_config,
+            quant_config,
+            max_num_tokens,
+            num_dispatchers,
+        )
+        self.is_fp8 = True
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -180,18 +180,9 @@ class Fp8Config(QuantizationConfig):
            weight_block_size=weight_block_size,
        )

-    def get_xpu_quant_method(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> "QuantizeMethodBase | None":
-        raise NotImplementedError(
-            "FP8 quantization is not supported during xpu kernel migration."
-        )
-
    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> "QuantizeMethodBase | None":
-        if current_platform.is_xpu():
-            return self.get_xpu_quant_method(layer, prefix)
        if isinstance(layer, LinearBase):
            if is_layer_skipped(
                prefix=prefix,
@@ -300,7 +291,7 @@ class Fp8LinearMethod(LinearMethodBase):
            or envs.VLLM_TEST_FORCE_FP8_MARLIN
        )
        # Disable marlin for rocm
-        if current_platform.is_rocm():
+        if current_platform.is_rocm() or current_platform.is_xpu():
            self.use_marlin = False
        if vllm_is_batch_invariant():
            self.use_marlin = False
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/init.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/init.py
@@ -39,6 +39,9 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKer
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
    TritonInt8ScaledMMLinearKernel,
 )
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.xpu import (
+    XPUFP8ScaledMMLinearKernel,
+)
 from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
 from vllm.platforms import PlatformEnum, current_platform

@@ -72,6 +75,9 @@ _POSSIBLE_FP8_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]]] =
        PerTensorTorchFP8ScaledMMLinearKernel,
        ChannelWiseTorchFP8ScaledMMLinearKernel,
    ],
+    PlatformEnum.XPU: [
+        XPUFP8ScaledMMLinearKernel,
+    ],
 }

 _KernelT = TypeVar("_KernelT", bound=ScaledMMLinearKernel)
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xpu.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xpu.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+import torch
+
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
+    FP8ScaledMMLinearKernel,
+    FP8ScaledMMLinearLayerConfig,
+)
+from vllm.platforms import current_platform
+
+
+class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
+    @classmethod
+    def is_supported(
+        cls, compute_capability: int | None = None
+    ) -> tuple[bool, str | None]:
+        if not current_platform.is_xpu():
+            return False, "XPUFP8ScaledMM only support on XPU"
+        return True, None
+
+    @classmethod
+    def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
+        if c.weight_quant_key.dtype not in {torch.float8_e5m2, torch.float8_e4m3fn}:
+            return False, "XPUFP8ScaledMM only support FP8 weight dtype"
+        return True, None
+
+    def __init__(
+        self, c: FP8ScaledMMLinearLayerConfig, layer_param_names: Sequence[str]
+    ) -> None:
+        assert self.can_implement(c)[0]
+        assert self.is_supported()[0]
+        self.config = c
+        self.layer_param_names = layer_param_names
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        weight = layer.weight
+        weight_scale = layer.weight_scale
+        return torch.ops._xpu_C.fp8_gemm_w8a16(x, weight, weight_scale, bias)
+
+    def apply_scaled_mm(
+        self,
+        *,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out_dtype: torch.dtype,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        bias: torch.Tensor | None,
+        output_shape: list,
+    ) -> torch.Tensor:
+        pass
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -160,7 +160,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
            logger.info_once("Using Triton backend")
            return Mxfp4Backend.TRITON
    elif current_platform.is_xpu():
-        logger.info_once("Using ipex marlin backend on XPU")
+        logger.info_once("Using xpu backend on XPU")
        return Mxfp4Backend.MARLIN
    elif current_platform.is_rocm() and has_triton_kernels():
        logger.info_once("Using Triton backend")
--- a/vllm/model_executor/layers/sparse_attn_indexer.py
+++ b/vllm/model_executor/layers/sparse_attn_indexer.py
@@ -20,7 +20,7 @@ from vllm.v1.worker.workspace import current_workspace_manager
 if current_platform.is_cuda_alike():
    from vllm import _custom_ops as ops
 elif current_platform.is_xpu():
-    from vllm._ipex_ops import ipex_ops as ops
+    from vllm._xpu_ops import xpu_ops as ops

 logger = init_logger(__name__)

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -836,7 +836,7 @@ class DeepseekV2MLAAttention(nn.Module):
                qk_rope_head_dim,
                max_position=max_position_embeddings,
                rope_parameters=config.rope_parameters,
-                is_neox_style=True,
+                is_neox_style=not getattr(config, "indexer_rope_interleave", False),
            )
            self.indexer = Indexer(
                vllm_config,
@@ -1499,6 +1499,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
    pass


+class GlmMoeDsaForCausalLM(DeepseekV2ForCausalLM):
+    pass
+
+
 # Compatibility with
 # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
 def get_spec_layer_idx_from_weight_name(
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -114,6 +114,7 @@ _TEXT_GENERATION_MODELS = {
    "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"),
    "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"),
    "Glm4MoeLiteForCausalLM": ("glm4_moe_lite", "Glm4MoeLiteForCausalLM"),
+    "GlmMoeDsaForCausalLM": ("deepseek_v2", "GlmMoeDsaForCausalLM"),
    "GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"),
    "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
    "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -338,7 +338,6 @@ class CpuPlatform(Platform):
                ld_preload_str += pytorch_libgomp_so
                os.environ["LD_PRELOAD"] = ld_preload_str

-        # To hint IPEX uses shared memory based AllReduce
        os.environ["LOCAL_WORLD_SIZE"] = str(
            vllm_config.parallel_config.tensor_parallel_size
        )
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -237,6 +237,7 @@ class ModelArchConfigConvertorBase:
            "deepseek_v3",
            "deepseek_v32",
            "deepseek_mtp",
+            "glm_moe_dsa",
            "glm4_moe_lite",
            "glm4_moe_lite_mtp",
            "kimi_k2",
--- a/vllm/v1/attention/backends/fa_utils.py
+++ b/vllm/v1/attention/backends/fa_utils.py
@@ -23,12 +23,11 @@ if current_platform.is_cuda():

 elif current_platform.is_xpu():
    from vllm import _custom_ops as ops
+    from vllm._xpu_ops import xpu_ops

    reshape_and_cache_flash = ops.reshape_and_cache_flash
-    from vllm._ipex_ops import ipex_ops
-
-    flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func  # type: ignore[assignment]
-    get_scheduler_metadata = ipex_ops.get_scheduler_metadata  # type: ignore[assignment]
+    flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func  # type: ignore[assignment]
+    get_scheduler_metadata = xpu_ops.get_scheduler_metadata  # type: ignore[assignment]
 elif current_platform.is_rocm():
    try:
        from flash_attn import flash_attn_varlen_func  # type: ignore[no-redef]
@@ -153,7 +152,7 @@ def is_flash_attn_varlen_func_available() -> bool:

    Platform-specific sources:
    - CUDA: vllm.vllm_flash_attn.flash_attn_varlen_func
-    - XPU: ipex_ops.flash_attn_varlen_func
+    - XPU: xpu_ops.flash_attn_varlen_func
    - ROCm: upstream flash_attn.flash_attn_varlen_func (if available)

    Note: This is separate from the AITER flash attention backend (rocm_aiter_fa.py)
--- a/vllm/v1/attention/ops/paged_attn.py
+++ b/vllm/v1/attention/ops/paged_attn.py
@@ -9,7 +9,7 @@ from vllm.platforms import current_platform
 if current_platform.is_cuda_alike():
    from vllm import _custom_ops as ops
 elif current_platform.is_xpu():
-    from vllm._ipex_ops import ipex_ops as ops  # type: ignore[no-redef]
+    from vllm._xpu_ops import xpu_ops as ops  # type: ignore[no-redef]


 class PagedAttention:
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -982,10 +982,8 @@ class Scheduler(SchedulerInterface):

        session._all_token_ids.extend(update.prompt_token_ids or ())
        session.prompt_token_ids.extend(update.prompt_token_ids or ())
-        # Update block hashes for the new tokens
-        # (mirrors Request.append_output_token_ids)
-        if session.get_hash_new_full_blocks is not None:
-            session.block_hashes.extend(session.get_hash_new_full_blocks())
+        # Update block hashes for the new tokens.
+        session.update_block_hashes()
        session.num_prompt_tokens = len(session.prompt_token_ids)
        session.arrival_time = update.arrival_time
        session.sampling_params = update.sampling_params
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -6,7 +6,6 @@ import time
 from collections import deque
 from collections.abc import Callable, Mapping
 from dataclasses import dataclass
-from functools import partial
 from typing import TYPE_CHECKING, Any

 import torch
@@ -164,10 +163,11 @@ class Request:
        self.num_external_computed_tokens = 0

        self.block_hashes: list[BlockHash] = []
-        self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None
-        if block_hasher is not None:
-            self.get_hash_new_full_blocks = partial(block_hasher, self)
-            self.block_hashes = self.get_hash_new_full_blocks()
+        # Store the block hasher without binding self to avoid creating a
+        # reference cycle (Request -> partial -> Request) that prevents
+        # immediate garbage collection via reference counting.
+        self._block_hasher: Callable[[Request], list[BlockHash]] | None = block_hasher
+        self.update_block_hashes()

        self.skip_reading_prefix_cache = self.get_skip_reading_prefix_cache()

@@ -212,8 +212,12 @@ class Request:
            self._output_token_ids.extend(token_ids)
            self._all_token_ids.extend(token_ids)

-        if self.get_hash_new_full_blocks is not None:
-            self.block_hashes.extend(self.get_hash_new_full_blocks())
+        self.update_block_hashes()
+
+    def update_block_hashes(self) -> None:
+        """Compute block hashes for any new full blocks and append them."""
+        if self._block_hasher is not None:
+            self.block_hashes.extend(self._block_hasher(self))

    @property
    def use_structured_output(self) -> bool:
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1503,6 +1503,24 @@ class SpecDecodeBaseProposer:
                del self.model.lm_head
            self.model.lm_head = target_language_model.lm_head

+            # MTP models call compute_logits via shared_head.head (a
+            # ParallelLMHead inside each MTP layer), not self.model.lm_head.
+            # If the checkpoint omits a copy of the lm_head weights at the
+            # MTP layer path, shared_head.head stays uninitialised and
+            # produces NaN logits. Always share it explicitly.
+            inner = getattr(self.model, "model", None)
+            layers = getattr(inner, "layers", None) if inner else None
+            if layers is not None:
+                items = layers.values() if isinstance(layers, nn.ModuleDict) else layers
+                for layer in items:
+                    sh = getattr(layer, "shared_head", None)
+                    if sh is not None and hasattr(sh, "head"):
+                        del sh.head
+                        sh.head = target_language_model.lm_head
+                        logger.info(
+                            "Shared target model lm_head with MTP shared_head.head."
+                        )
+
    @torch.inference_mode()
    def dummy_run(
        self,
Author	SHA1	Message	Date
Andreas Karatzas	89a77b1084	[ROCm][CI] Pin TorchCodec to v0.10.0 for ROCm compatibility (#34447 ) Signed-off-by: Andreas Karatzas <akaratza@amd.com> (cherry picked from commit `4c078fa546`) (cherry picked from commit a976961fb77d38129abf69edd4952101731f2421)	2026-02-24 20:30:22 -08:00
Kevin H. Luu	d3c1513f5f	[ci] Use the right tag for CPU arm64 image (#34915 ) Signed-off-by: Kevin H. Luu <khluu000@gmail.com> (cherry picked from commit `a1a2d79442`) (cherry picked from commit 772f70839192262ff01c533d821a11a225d1c00f)	2026-02-24 20:30:13 -08:00
Cyrus Leung	5dbfbc967b	[CI/Build] Fix gRPC version mismatch (#35013 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> (cherry picked from commit `965fe45935`) (cherry picked from commit 90308959295b66049024649fe1273070477f343d)	2026-02-24 20:30:02 -08:00
khluu	c86cdcbcd2	Revert "[Release 2.10] Update to Torch 2.10 - final release (#30525 )" This reverts commit `f97ca67176`.	2026-02-24 20:28:53 -08:00
khluu	3c9496f146	Revert "[Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153 )" This reverts commit `55a1baebc5`.	2026-02-24 20:28:45 -08:00
khluu	2d5be1dd5c	release script Signed-off-by: khluu <khluu000@gmail.com>	2026-02-12 02:37:52 -08:00
Michael Goin	7a06e5b05b	[Bugfix] Fix MTP accuracy for GLM-5 (#34385 ) Signed-off-by: mgoin <mgoin64@gmail.com> (cherry picked from commit `ec12d39d44`)	2026-02-11 20:54:27 -08:00
Junseo Park	946b2f106c	[Bugfix] send None sentinel on final commit so server properly sends transcription.done (#33963 ) Signed-off-by: pjs102793 <pjs102793@naver.com> Co-authored-by: Nick Hill <nickhill123@gmail.com> (cherry picked from commit `5458eb835d`)	2026-02-11 20:54:14 -08:00
Nick Hill	5e8adb0c49	[Misc] Bump `fastsafetensors` version for latest fixes (#34273 ) Signed-off-by: Nick Hill <nickhill123@gmail.com> (cherry picked from commit `79504027ef`)	2026-02-11 20:54:00 -08:00
Xinyu Dong	9be1ff2d3a	[Bugfix] fix default is_neox_style is True for deepseek (#34353 ) Signed-off-by: dongxinyu03 <dongxinyu03@baidu.com> (cherry picked from commit `be7f3d5d20`)	2026-02-11 20:53:40 -08:00
Jee Jee Li	b3ee90f961	[Model] GLM adaptation (#34124 ) (cherry picked from commit `978a37c823`)	2026-02-11 20:53:11 -08:00
Seiji Eicher	c44d0c6d66	Patch protobuf for CVE-2026-0994 (#34253 ) Signed-off-by: Seiji Eicher <seiji@anyscale.com> Co-authored-by: Kevin H. Luu <khluu000@gmail.com> (cherry picked from commit `5045d5c983`)	2026-02-11 02:33:40 -08:00
Kunshang Ji	83db96d8cd	[XPU][9/N] clean up existing ipex code/doc (#34111 ) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> (cherry picked from commit `cb9574eb85`)	2026-02-11 02:33:27 -08:00
zofia	dbfb79fe45	[XPU][7/N] enable xpu fp8 moe (#34202 ) Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com> (cherry picked from commit `b482f71e9f`)	2026-02-11 02:33:15 -08:00
Roger Wang	b2e1fc3589	[Bugfix][Core] Fix CPU memory leak from Request reference cycle in prefix caching (#34183 ) Signed-off-by: Roger Wang <hey@rogerw.io> (cherry picked from commit `8a5e0e2b2b`)	2026-02-11 02:33:04 -08:00
Gregory Shtrasberg	55a1baebc5	[Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153 ) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> (cherry picked from commit `c60f8e3b49`)	2026-02-11 02:32:52 -08:00
Charlie Fu	e1e9841631	[torch.compile][Fusion] Fix attention fusion pass removing kv_udpate op. (#33945 ) Signed-off-by: charlifu <charlifu@amd.com> (cherry picked from commit `bb9f97308d`)	2026-02-11 02:32:41 -08:00
zofia	5bd63387c3	[XPU][6/N] add xpu scaled_mm kernel (#34117 ) Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com> (cherry picked from commit `9bdb06b436`)	2026-02-11 02:32:27 -08:00