Compare commits


40 Commits

Author SHA1 Message Date
simon-mo
a5dd03c1eb Revert "[V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)"
This reverts commit e202dd2736.
2025-07-06 14:02:36 -07:00
Cyrus Leung
c18b3b8e8b [Bugfix] Add use_cross_encoder flag to use correct activation in ClassifierPooler (#20527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 14:01:48 -07:00
Woosuk Kwon
9528e3a05e [BugFix][Spec Decode] Fix spec token ids in model runner (#20530)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-06 19:44:52 +00:00
Cyrus Leung
9fb52e523a [V1] Support any head size for FlexAttention backend (#20467)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 09:54:36 -07:00
Woosuk Kwon
e202dd2736 [V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-07-06 08:48:13 -07:00
Reid
43813e6361 [Misc] call the pre-defined func (#20518)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-06 10:25:29 +00:00
Brayden Zhong
cede942b87 [Benchmark] Add support for multiple batch size benchmark through CLI in benchmark_moe.py (#20516)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-07-06 09:20:11 +00:00
Flora Feng
fe1e924811 [Frontend] Support image object in llm.chat (#19635)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Flora Feng <4florafeng@gmail.com>
2025-07-06 06:47:13 +00:00
Chengji Yao
4548c03c50 [TPU][Bugfix] fix the MoE OOM issue (#20339)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-05 21:19:09 -07:00
Lucas Wilkinson
40b86aa05e [BugFix] Fix: ImportError when building on hopper systems (#20513)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-06 12:17:30 +08:00
Lucia Fang
432870829d [Bugfix] Fix missing per_act_token parameter in compressed_tensors_moe (#20509)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-07-06 12:08:30 +08:00
Vadim Gimpelson
f73d02aadc [BUG] Fix #20484. Support empty sequence in cuda penalty kernel (#20491)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-07-05 19:38:02 -07:00
Jeremy Reizenstein
c5ebe040ac test_attention compat with coming xformers change (#20487)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-05 19:37:59 -07:00
Reid
8d763cb891 [Misc] remove unused import (#20517)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 19:17:06 -07:00
Reid
cf4cd53982 [Misc] Add logger.exception for TPU information collection failures (#20510)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 07:24:32 -07:00
Isotr0py
32c9be2200 [v1] Re-add fp32 support to v1 engine through FlexAttention (#19754)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-05 09:41:10 +00:00
Lucia Fang
8aeaa910a2 Fix unknown attribute of topk_indices_dtype in CompressedTensorsW8A8Fp8MoECutlassMethod (#20507)
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-07-05 14:03:20 +08:00
Jee Jee Li
906e05d840 [Misc] Remove the unused LoRA test code (#20494)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-05 13:48:16 +08:00
Reid
ef9a2990ae [doc] small fix (#20506)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:56:39 -07:00
Reid
7e90870491 [Misc] Add security warning for development mode endpoints (#20508)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:52:13 -07:00
Guy Stone
d3f05c9248 [Doc] fix mutltimodal_inputs.md gh examples link (#20497)
Signed-off-by: Guy Stone <guys@spotify.com>
2025-07-04 16:41:35 -07:00
Michael Goin
c108781c85 [CI Bugfix] Fix pre-commit failures on main (#20502) 2025-07-04 14:17:30 -07:00
Duncan Moss
3d184b95b8 [feat]: CUTLASS block scaled group gemm for SM100 (#19757)
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
Co-authored-by: Duncan Moss <dmoss@nvidia.com>
2025-07-04 12:58:04 -06:00
Thomas Parnell
2f35a022e6 Enable V1 for Hybrid SSM/Attention Models (#20016)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Stanislaw Wozniak <stw@zurich.ibm.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-07-04 17:46:53 +00:00
Chenheli Hua
ffe00ef77a [Misc] Small: Remove global media connector. Each test should have its own test connector object. (#20395)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-07-04 08:15:03 -07:00
Peter Pan
5561681d04 [CI] add kvcache-connector dependency definition and add into CI build (#18193)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-07-04 06:49:18 -07:00
Cyrus Leung
fbd62d8750 [Doc] Fix classification table in list of supported models (#20489)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-04 06:08:02 -07:00
wang.yuqi
2e26f9156a [Model][3/N] Automatic conversion of CrossEncoding model (#20168)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-04 05:47:39 -07:00
sangbumlikeagod
9e5452ee34 [Bug][Frontend] Fix structure of transcription's decoder_prompt (#18809)
Signed-off-by: sangbumlikeagod <oironese@naver.com>
2025-07-04 11:28:07 +00:00
Michael Goin
0e3fe896e2 Support Llama 4 for fused_marlin_moe (#20457)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-04 07:55:10 +00:00
Jee Jee Li
1caca5a589 [Misc] Add SPDX-FileCopyrightText (#20428)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-04 07:40:42 +00:00
Wentao Ye
783921d889 [Perf] Optimize Vectorization Utils for Int 8 Quantization Kernels (#20331)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-04 15:06:24 +08:00
Aaron Pham
4a98edff1f [Structured Outputs][V1] Skipping with models doesn't contain tokenizers (#20365)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-07-04 15:05:49 +08:00
Reid
a7bab0c9e5 [Misc] small update (#20462)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-03 20:33:44 -07:00
汪志鹏
25950dca9b Add ignore consolidated file in mistral example code (#20420)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-07-04 02:55:07 +00:00
Gabriel Marinho
a4113b035c [Platform] Add custom default max tokens (#18557)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
2025-07-04 10:50:17 +08:00
Michael Goin
7e1665b089 [Misc] Change warn_for_unimplemented_methods to debug (#20455) 2025-07-04 02:35:08 +00:00
Seiji Eicher
8d1096e7db [Bugfix] Register reducer even if transformers_modules not available (#19510)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-07-03 22:08:12 +00:00
Nicolò Lucchesi
8d775dd30a [Misc] Fix Unable to detect current VLLM config. Defaulting to NHD kv cache layout warning (#20400)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-07-03 14:56:09 -07:00
bnellnm
78fe77534b [Kernel] Enable fp8 support for pplx and BatchedTritonExperts. (#18864)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-07-03 14:55:40 -07:00
175 changed files with 3968 additions and 1372 deletions

View File

@@ -52,7 +52,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  - label: "Annotate release workflow"

View File

@@ -107,10 +107,9 @@ fi
 if [[ $commands == *" kernels/attention"* ]]; then
   commands="${commands} \
-  --ignore=kernels/attention/stest_attention_selector.py \
+  --ignore=kernels/attention/test_attention_selector.py \
   --ignore=kernels/attention/test_blocksparse_attention.py \
   --ignore=kernels/attention/test_encoder_decoder_attn.py \
-  --ignore=kernels/attention/test_attention_selector.py \
   --ignore=kernels/attention/test_flash_attn.py \
   --ignore=kernels/attention/test_flashinfer.py \
   --ignore=kernels/attention/test_prefix_prefill.py \

.github/CODEOWNERS vendored
View File

@@ -16,7 +16,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
-CMakeLists.txt @tlrmchlsmth
+CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people

View File

@@ -68,7 +68,7 @@ jobs:
       export AWS_ACCESS_KEY_ID=minioadmin
       export AWS_SECRET_ACCESS_KEY=minioadmin
       sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
-      helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+      helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
     - name: curl test
       run: |

View File

@@ -259,7 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -615,6 +615,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures.") "in CUDA target architectures.")
endif() endif()
endif() endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
"if you intend on running FP8 quantized MoE models on Blackwell.")
else()
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
  #
  # Machete kernels

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import copy
import itertools

View File

@@ -620,7 +620,7 @@ def main(args: argparse.Namespace):
             4096,
         ]
     else:
-        batch_sizes = [args.batch_size]
+        batch_sizes = args.batch_size

     use_deep_gemm = bool(args.use_deep_gemm)
@@ -728,7 +728,7 @@ if __name__ == "__main__":
     )
     parser.add_argument("--use-deep-gemm", action="store_true")
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument("--batch-size", type=int, nargs="+", required=False)
     parser.add_argument("--tune", action="store_true")
     parser.add_argument("--trust-remote-code", action="store_true")
     parser.add_argument("--model-prefix", type=str, required=False)

View File

@@ -45,7 +45,6 @@
#include "cute/algorithm/functional.hpp" #include "cute/algorithm/functional.hpp"
#include "cute/atom/mma_atom.hpp" #include "cute/atom/mma_atom.hpp"
#include "cute/algorithm/gemm.hpp" #include "cute/algorithm/gemm.hpp"
#include "cute/tensor_predicate.hpp"
#include "cute/numeric/arithmetic_tuple.hpp" #include "cute/numeric/arithmetic_tuple.hpp"
#include "cutlass_extensions/gemm/dispatch_policy.hpp" #include "cutlass_extensions/gemm/dispatch_policy.hpp"

View File

@@ -162,10 +162,11 @@ __global__ void dynamic_scaled_int8_quant_kernel(
   // calculate for absmax
   float thread_max = 0.f;
-  for (int i = tid; i < hidden_size; i += stride) {
-    const auto v = fabsf(static_cast<float>(row_in[i]));
-    thread_max = fmaxf(thread_max, v);
-  }
+  vectorize_read_with_alignment<16>(
+      row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) {
+        const float v = fabsf(static_cast<float>(src));
+        thread_max = fmaxf(thread_max, v);
+      });
   using BlockReduce = cub::BlockReduce<float, 256>;
   __shared__ typename BlockReduce::TempStorage tmp;
   float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
@@ -232,9 +233,10 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
   // 1. calculate min & max
   MinMax thread_mm;
-  for (int i = tid; i < hidden_size; i += stride) {
-    thread_mm += static_cast<float>(row_in[i]);
-  }
+  vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
+                                    [&] __device__(const scalar_t& src) {
+                                      thread_mm += static_cast<float>(src);
+                                    });
   using BlockReduce = cub::BlockReduce<MinMax, 256>;
   __shared__ typename BlockReduce::TempStorage tmp;
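For context, the absmax these kernels accumulate (now with vectorized reads) feeds the usual symmetric dynamic-quantization scale. A NumPy sketch of the per-row math (illustrative only; the 127 divisor is the standard int8 convention, not something this diff changes):

```python
import numpy as np

row = np.random.randn(1024).astype(np.float32)
absmax = np.abs(row).max()  # what thread_max + BlockReduce compute on-GPU
scale = absmax / 127.0      # symmetric int8 scale
q = np.clip(np.round(row / scale), -128, 127).astype(np.int8)
```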

View File

@@ -51,7 +51,8 @@ struct cutlass_3x_gemm {
   // These are the minimum alignments needed for the kernels to compile
   static constexpr int AlignmentAB =
       128 / cutlass::sizeof_bits<ElementAB>::value;
-  static constexpr int AlignmentCD = 4;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;

   using CollectiveEpilogue =
       typename cutlass::epilogue::collective::CollectiveBuilder<
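The replaced constant makes the minimum C/D alignment track the output dtype: 128 bits per access divided by the element width, rather than a hardcoded 4. A quick illustration of the arithmetic (dtype widths listed for illustration):

```python
# elements per 128-bit (16-byte) access, per output dtype
for name, bits in [("float8_e4m3", 8), ("float16/bfloat16", 16), ("float32", 32)]:
    print(f"{name}: AlignmentCD = {128 // bits}")
# float8_e4m3: 16, float16/bfloat16: 8, float32: 4 (the old hardcoded value)
```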

View File

@@ -0,0 +1,374 @@
#include "core/registration.h"
#include <torch/all.h>
#include <cutlass/arch/arch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/group_array_problem_shape.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include <cassert>
using namespace cute;
template <typename ElementAB, typename ElementC, typename ElementAccumulator,
typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
__global__ void get_ggemm_starts(
int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
ElementAB* b_base_as_int, ElementC* out_base_as_int,
ElementAccumulator* a_scale_base_as_int,
ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
int expert_id = threadIdx.x;
if (expert_id >= gridDim.x * blockDim.x) {
return;
}
int m = problem_sizes[expert_id * 3];
int n = problem_sizes[expert_id * 3 + 1];
int k = problem_sizes[expert_id * 3 + 2];
int32_t expert_offset = expert_offsets[expert_id];
int a_stride = expert_offset * k;
int b_stride = expert_id * k * n;
int a_scale_stride = expert_offset * k / 128;
int b_scale_stride = expert_id * k * n / 128 / 128;
a_offsets[expert_id] = a_base_as_int + a_stride;
b_offsets[expert_id] = b_base_as_int + b_stride;
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
*layout_sfa_ptr =
ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
*layout_sfb_ptr =
ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
}
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
ScaleConfig) \
else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA, \
LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>( \
static_cast<int32_t*>(expert_offsets.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()), \
static_cast<C_TYPE**>(out_ptrs.data_ptr()), \
static_cast<float**>(a_scales_ptrs.data_ptr()), \
static_cast<float**>(b_scales_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()), \
static_cast<C_TYPE*>(out_tensors.data_ptr()), \
static_cast<float*>(a_scales.data_ptr()), \
static_cast<float*>(b_scales.data_ptr()), \
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()), \
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()), \
static_cast<int*>(problem_sizes.data_ptr())); \
}
template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
void run_get_ggemm_starts(
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
torch::Tensor out_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
int num_experts = (int)expert_offsets.size(0);
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
if (false) {
}
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
LayoutSFB, ScaleConfig)
__CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
LayoutSFB, ScaleConfig)
else {
TORCH_CHECK(false, "Unsupported output tensor type");
}
}
template <typename OutType, typename ScheduleConfig, typename LayoutD>
void run_blockwise_scaled_group_mm(
torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
const torch::Tensor& stride_b, const torch::Tensor& stride_c,
const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
// Types
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = OutType;
using ElementD = ElementC;
using ElementAccumulator = float;
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = LayoutD;
// Alignments
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
using ArchTag = cutlass::arch::Sm100;
using OperatorClass = cutlass::arch::OpClassTensorOp;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
typename ScheduleConfig::ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass, ElementA,
cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
AlignmentA, ElementB,
cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
typename ScheduleConfig::ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
typename ScheduleConfig::KernelSchedule>::CollectiveOp;
using GemmKernel =
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
CollectiveEpilogue, void>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
int num_experts = (int)expert_offsets.size(0);
Gemm gemm_op;
// Mainloop Arguments
typename GemmKernel::MainloopArguments mainloop_args{
static_cast<const ElementA**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(stride_a.data_ptr()),
static_cast<const ElementB**>(b_ptrs.data_ptr()),
static_cast<StrideB*>(stride_b.data_ptr()),
static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
layout_sfa.data_ptr()),
static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
layout_sfb.data_ptr())};
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = a_ptrs.get_device();
hw_info.sm_count =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
// Epilogue Arguments
typename GemmKernel::EpilogueArguments epilogue_args{
{}, // epilogue.thread
nullptr,
static_cast<StrideC*>(stride_c.data_ptr()),
static_cast<ElementD**>(out_ptrs.data_ptr()),
static_cast<StrideC*>(stride_c.data_ptr())};
UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
// Gemm Arguments
typename GemmKernel::Arguments args{
cutlass::gemm::GemmUniversalMode::kGrouped,
{num_experts, problem_sizes_as_shapes, nullptr},
mainloop_args,
epilogue_args,
hw_info};
at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM");
size_t workspace_size = gemm_op.get_workspace_size(args);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
auto workspace = torch::empty(workspace_size, workspace_options);
auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
status = gemm_op.run(stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void blockwise_scaled_group_mm_dispatch_shape(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
struct MmaConfig {
using ElementA = cutlass::float_e4m3_t;
using KernelSchedule =
cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
using LayoutC = cutlass::layout::RowMajor;
using MmaTileShape = Shape<_128, _128, _128>;
using ClusterShape = Shape<_1, _1, _1>;
};
int num_experts = (int)expert_offsets.size(0);
auto a_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto b_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto out_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto a_scales_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto b_scales_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto layout_sfa = torch::empty(
{num_experts, 5},
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
auto layout_sfb = torch::empty(
{num_experts, 5},
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
auto stride_a = torch::full(
{num_experts}, a.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto stride_b = torch::full(
{num_experts}, a.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto stride_c = torch::full(
{num_experts}, output.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
torch::TensorOptions options_int =
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
typename MmaConfig::LayoutSFB,
typename MmaConfig::ScaleConfig>(
expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
run_blockwise_scaled_group_mm<OutType, MmaConfig,
typename MmaConfig::LayoutC>(
out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
expert_offsets);
}
void cutlass_blockwise_scaled_grouped_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
TORCH_CHECK(problem_sizes.size(1) == 3,
"problem_sizes must have shape (num_experts, 3)");
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
"Number of experts in problem_sizes must match expert_offsets");
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
"problem_sizes must be int32");
TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
"a must be kFloat8_e4m3fn");
TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
"b must be kFloat8_e4m3fn");
TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
output.scalar_type() == torch::kHalf,
"output must be bfloat16 or half");
TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
"scales_a must be float32");
TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
"scales_b must be float32");
TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
"expert_offsets must be int32");
TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
TORCH_CHECK(problem_sizes.size(1) == 3,
"problem_sizes must have shape (num_experts, 3)");
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
"Number of experts in problem_sizes must match expert_offsets");
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
"problem_sizes must be int32");
TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
if (output.scalar_type() == torch::kBFloat16) {
blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
} else if (output.scalar_type() == torch::kFloat16) {
blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
} else {
TORCH_CHECK(false, "Unsupported output tensor type");
}
#endif
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_blockwise_scaled_grouped_mm",
&cutlass_blockwise_scaled_grouped_mm);
}
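A hypothetical Python-side invocation of the op registered above, with shapes inferred from the `TORCH_CHECK`s and stride math in this file (assuming vLLM's usual `torch.ops._C` namespace and a build with `ENABLE_CUTLASS_MOE_SM100`, i.e. SM100/Blackwell; illustrative only, not the eventual wrapper):

```python
import torch

E, M, N, K = 2, 256, 512, 256  # experts, total tokens, output dim, reduction dim
dev = "cuda"

a = torch.randn(M, K, device=dev).to(torch.float8_e4m3fn)     # 2D activations
b = torch.randn(E, N, K, device=dev).to(torch.float8_e4m3fn)  # 3D expert weights
scales_a = torch.ones(M, K // 128, device=dev)                # fp32, per 1x128 block
scales_b = torch.ones(E, N // 128, K // 128, device=dev)      # fp32, per 128x128 block
problem_sizes = torch.tensor([[M // E, N, K]] * E, device=dev, dtype=torch.int32)
expert_offsets = torch.tensor([0, M // E], device=dev, dtype=torch.int32)
out = torch.empty(M, N, device=dev, dtype=torch.bfloat16)

torch.ops._C.cutlass_blockwise_scaled_grouped_mm(
    out, a, b, scales_a, scales_b, problem_sizes, expert_offsets)
```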

View File

@@ -38,7 +38,6 @@
#include "cute/atom/mma_atom.hpp" #include "cute/atom/mma_atom.hpp"
#include "cute/atom/copy_traits_sm90_tma.hpp" #include "cute/atom/copy_traits_sm90_tma.hpp"
#include "cute/algorithm/gemm.hpp" #include "cute/algorithm/gemm.hpp"
#include "cute/tensor_predicate.hpp"
#include "cute/numeric/arithmetic_tuple.hpp" #include "cute/numeric/arithmetic_tuple.hpp"
#include "cutlass/pipeline/pipeline.hpp" #include "cutlass/pipeline/pipeline.hpp"
#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp" #include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"

View File

@@ -27,6 +27,26 @@ __device__ inline void vectorize_with_alignment(
  constexpr int WIDTH = VEC_SIZE * sizeof(InT);  // eg: 64 B
  uintptr_t addr = reinterpret_cast<uintptr_t>(in);
// fast path when the whole region is already aligned
// Note: currently the output is guaranteed to be same as the input, so we
// don't check it here, comments here just for future reference.
bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
if (can_vec) {
int num_vec = len / VEC_SIZE;
using vin_t = vec_n_t<InT, VEC_SIZE>;
using vout_t = vec_n_t<OutT, VEC_SIZE>;
auto* v_in = reinterpret_cast<const vin_t*>(in);
auto* v_out = reinterpret_cast<vout_t*>(out);
for (int i = tid; i < num_vec; i += stride) {
vout_t tmp;
vec_op(tmp, v_in[i]);
v_out[i] = tmp;
}
return;
}
  int misalignment_offset = addr & (WIDTH - 1);       // addr % 64
  int alignment_bytes = WIDTH - misalignment_offset;  // 64 - (addr % 64)
  int prefix_elems = alignment_bytes & (WIDTH - 1);   // handle 64
@@ -72,4 +92,81 @@ __device__ __forceinline__ void vectorize_with_alignment(const InT* in,
                                    std::forward<ScaOp>(scalar_op));
}
template <int VEC_SIZE, typename InT, typename ScaOp>
struct DefaultReadVecOp {
ScaOp scalar_op;
__device__ __forceinline__ void operator()(
const vec_n_t<InT, VEC_SIZE>& src) const {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
scalar_op(src.val[i]);
}
}
};
// read-only version: iterate over the input with alignment guarantees
template <int VEC_SIZE, typename InT, typename VecOp, typename ScaOp>
__device__ inline void vectorize_read_with_alignment(const InT* in, int len,
int tid, int stride,
VecOp&& vec_op,
ScaOp&& scalar_op) {
static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
"VEC_SIZE must be a positive power-of-two");
constexpr int WIDTH = VEC_SIZE * sizeof(InT);
uintptr_t addr = reinterpret_cast<uintptr_t>(in);
// fast path when the whole region is already aligned
bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
if (can_vec) {
int num_vec = len / VEC_SIZE;
using vin_t = vec_n_t<InT, VEC_SIZE>;
auto* v_in = reinterpret_cast<const vin_t*>(in);
for (int i = tid; i < num_vec; i += stride) {
vec_op(v_in[i]);
}
return;
}
int misalignment_offset = addr & (WIDTH - 1);
int alignment_bytes = WIDTH - misalignment_offset;
int prefix_elems = alignment_bytes & (WIDTH - 1);
prefix_elems /= sizeof(InT);
prefix_elems = min(prefix_elems, len);
// 1. handle the possibly unaligned prefix with scalar access.
for (int i = tid; i < prefix_elems; i += stride) {
scalar_op(in[i]);
}
in += prefix_elems;
len -= prefix_elems;
int num_vec = len / VEC_SIZE;
using vin_t = vec_n_t<InT, VEC_SIZE>;
auto* v_in = reinterpret_cast<const vin_t*>(in);
// 2. vectorized traversal of the main aligned region.
for (int i = tid; i < num_vec; i += stride) {
vec_op(v_in[i]);
}
// 3. handle remaining tail elements.
int tail_start = num_vec * VEC_SIZE;
for (int i = tid + tail_start; i < len; i += stride) {
scalar_op(in[i]);
}
}
// overload that requires only a scalar_op
template <int VEC_SIZE, typename InT, typename ScaOp>
__device__ __forceinline__ void vectorize_read_with_alignment(
const InT* in, int len, int tid, int stride, ScaOp&& scalar_op) {
using Vec = DefaultReadVecOp<VEC_SIZE, InT, std::decay_t<ScaOp>>;
vectorize_read_with_alignment<VEC_SIZE>(in, len, tid, stride, Vec{scalar_op},
std::forward<ScaOp>(scalar_op));
}
}  // namespace vllm
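To see what the prefix/main/tail split above does, here is a plain-Python restatement with `WIDTH` collapsed to element units for readability (illustrative only, not vLLM code):

```python
VEC_SIZE = 4  # power of two, so x & (VEC_SIZE - 1) == x % VEC_SIZE

def split_for_vectorization(addr: int, length: int):
    misalignment = addr & (VEC_SIZE - 1)
    prefix = min((VEC_SIZE - misalignment) & (VEC_SIZE - 1), length)  # scalar head
    main = (length - prefix) // VEC_SIZE * VEC_SIZE                   # vector body
    tail = length - prefix - main                                     # scalar tail
    return prefix, main, tail

print(split_for_vectorization(addr=6, length=21))  # (2, 16, 3)
```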

View File

@@ -59,6 +59,8 @@ void apply_repetition_penalties_(
  int vocab_size = logits.size(-1);
  int num_seqs = logits.size(0);

  if (num_seqs == 0) return;

  // Get number of SMs on the current device
  int sms = 0;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,

View File

@@ -79,7 +79,8 @@ struct cutlass_sparse_3x_gemm {
   // These are the minimum alignments needed for the kernels to compile
   static constexpr int AlignmentAB =
       128 / cutlass::sizeof_bits<ElementAB>::value;
-  static constexpr int AlignmentCD = 4;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;

   using CollectiveEpilogue =
       typename cutlass::epilogue::collective::CollectiveBuilder<

View File

@@ -393,6 +393,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      {stride_tag});
  ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
// cutlass blockwise scaledgroup GEMM
ops.def(
"cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
"Tensor scales_a, Tensor scales_b, "
"Tensor problem_sizes, Tensor expert_offsets) -> ()",
{stride_tag});
// conditionally compiled so impl registration is in source file
  // cutlass nvfp4 block scaled group GEMM
  ops.def(
      "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"

View File

@@ -1,3 +1,4 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

@@ -62,12 +63,16 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# Flag enables build-in KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive

ARG DEADSNAKES_MIRROR_URL

@@ -276,6 +281,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG INSTALL_KV_CONNECTORS=false
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

@@ -485,6 +491,7 @@ RUN mv mkdocs.yaml test_docs/
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

@@ -493,8 +500,13 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY requirements/kv_connectors.txt requirements/kv_connectors.txt

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
        uv pip install --system -r requirements/kv_connectors.txt; \
    fi; \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \

View File

@@ -14,7 +14,7 @@ Before setting up the incremental build:
   VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
   ```

-2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
+2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.

View File

@@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
```python
import torch

from vllm import LLM
from vllm.assets.image import ImageAsset

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

image_url = "https://picsum.photos/id/32/512/512"
image_pil = ImageAsset("cherry_blossom").pil_image
image_embeds = torch.load(...)  # pre-computed image embeddings
conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "image_pil", "image_pil": image_pil},
            {"type": "image_embeds", "image_embeds": image_embeds},
            {"type": "text", "text": "What's in these images?"},
        ],
    },
]

# Perform inference and log output.
outputs = llm.chat(conversation)
for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:

??? Code
@@ -228,7 +271,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
 If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
 If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.

-For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
+For certain models, we provide alternative chat templates inside <gh-dir:examples>.
 For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.

 ### Image Inputs

View File

@@ -470,6 +470,7 @@ Specified using `--task classify`.
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |

If your model is not in the above list, we will try to automatically convert the model using
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
@@ -477,12 +478,20 @@ If your model is not in the above list, we will try to automatically convert the
Specified using `--task score`.

| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|---------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |

!!! note
    Load the official original `mxbai-rerank-v2` by using the following command.

    ```bash
    vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
    ```

!!! note
    Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>.
@@ -490,6 +499,7 @@ Specified using `--task score`.
    ```bash
    vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
    ```

[](){ #supported-mm-models }

## List of Multimodal Language Models
@@ -616,9 +626,6 @@ Specified using `--task generate`.
!!! note
    Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.

-!!! note
-    `h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.

!!! note
    To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.

@@ -661,11 +668,8 @@ Specified using `--task generate`.
    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.

!!! note
-    To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
-    `pip install git+https://github.com/huggingface/transformers.git`.
-    Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
-    `--mm-processor-kwargs '{"use_audio_in_video": true}'`.
+    For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
+    is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.

#### Transcription

View File

@@ -6,6 +6,7 @@ import argparse
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
 from vllm.assets.image import ImageAsset

 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
     )

     prompt = "Describe this image in one sentence."
-    image_url = "https://picsum.photos/id/237/200/300"

     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
             ],
         },
     ]

View File

@@ -57,7 +57,10 @@ Once you have collected your profiles with this script, you can visualize them u
Here are most likely the dependencies you need to install:

```bash
-pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
+pip install tensorflow-cpu \
+  tensorboard-plugin-profile \
+  etils \
+  importlib_resources
```

Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from transformers import AutoTokenizer

View File

@@ -98,7 +98,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262  #noqa
     prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
-        model="Salesforce/blip2-opt-6.7b",
+        model="Salesforce/blip2-opt-2.7b",
         limit_mm_per_prompt={modality: 1},
     )
@@ -677,6 +677,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         tensor_parallel_size=2,
         limit_mm_per_prompt={modality: 1},
         ignore_patterns=["consolidated.safetensors"],
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -970,7 +971,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
     )

-# Qwen
+# Qwen-VL
 def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"

View File

@@ -505,6 +505,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
         max_num_seqs=2,
         tensor_parallel_size=2,
         limit_mm_per_prompt={"image": len(image_urls)},
         ignore_patterns=["consolidated.safetensors"],
     )

     placeholders = "[IMG]" * len(image_urls)

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import socket

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from typing import Optional

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call

View File

@@ -13,13 +13,15 @@ vllm serve Qwen/Qwen2.5-3B-Instruct
To serve a reasoning model, you can use the following command:
```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
+    --reasoning-parser deepseek_r1
```
If you want to run this script standalone with `uv`, you can use the following:
```bash
-uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-output
+uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
+    structured-output
```
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
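The example script drives these constraints through the OpenAI-compatible API. As a rough sketch of what one constrained request looks like when sent by hand — assuming the `Qwen/Qwen2.5-3B-Instruct` server from above on its default port, and vLLM's `guided_regex` extra body parameter (an assumption, not shown in this diff):

```python
# Hand-rolled structured-outputs request against a local vLLM server.
# Assumes `vllm serve Qwen/Qwen2.5-3B-Instruct` is listening on :8000 and
# that the server accepts the `guided_regex` extra body parameter.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{"role": "user", "content": "Name one EU capital."}],
    extra_body={"guided_regex": r"(Paris|Berlin|Madrid)"},
)
print(completion.choices[0].message.content)  # output constrained to the regex
```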
@@ -44,7 +46,9 @@ uv run structured_outputs.py --stream
Run certain constraints, for example `structural_tag` and `regex`, streaming:
```bash
-uv run structured_outputs.py --constraint structural_tag regex --stream
+uv run structured_outputs.py \
+    --constraint structural_tag regex \
+    --stream
```
Run all constraints, with reasoning models and streaming:

View File

@@ -202,7 +202,7 @@ def parse_args():
-def deserialize():
+def deserialize(args, tensorizer_config):
    if args.lora_path:
        tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
        llm = LLM(model=args.model,
@@ -242,7 +242,7 @@ def deserialize():
    return llm
-if __name__ == '__main__':
+def main():
    args = parse_args()
    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
@@ -260,8 +260,6 @@ if __name__ == '__main__':
    model_ref = args.model
-    model_name = model_ref.split("/")[1]
    if args.command == "serialize" or args.command == "deserialize":
        keyfile = args.keyfile
    else:
@@ -309,6 +307,10 @@ if __name__ == '__main__':
            encryption_keyfile = keyfile,
            **credentials
        )
-        deserialize()
+        deserialize(args, tensorizer_config)
    else:
        raise ValueError("Either serialize or deserialize must be specified.")
+if __name__ == "__main__":
+    main()
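The shape of this change is the usual `main()` refactor: module-level script logic moves behind an entry-point function, and its dependencies become parameters instead of globals. A minimal sketch of the pattern, with hypothetical stand-ins for the script's real helpers:

```python
import argparse


def parse_args() -> argparse.Namespace:
    # Hypothetical stand-in for the script's real argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="facebook/opt-125m")
    return parser.parse_args()


def deserialize(args: argparse.Namespace) -> None:
    # State arrives as a parameter instead of a module global, so the
    # function can be imported and tested without running the script.
    print(f"deserializing {args.model}")


def main() -> None:
    args = parse_args()
    deserialize(args)


if __name__ == "__main__":
    main()
```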

View File

@@ -0,0 +1 @@
lmcache

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest

View File

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
from unittest.mock import patch
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM
def test_mp_reducer(monkeypatch):
"""
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
without transformers_modules. This is a regression test for
https://github.com/vllm-project/vllm/pull/18640.
"""
# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
monkeypatch.setenv('VLLM_USE_V1', '1')
# Ensure transformers_modules is not in sys.modules
if 'transformers_modules' in sys.modules:
del sys.modules['transformers_modules']
with patch('multiprocessing.reducer.register') as mock_register:
engine_args = AsyncEngineArgs(
model="facebook/opt-125m",
max_model_len=32,
gpu_memory_utilization=0.1,
disable_log_stats=True,
disable_log_requests=True,
)
async_llm = AsyncLLM.from_engine_args(
engine_args,
start_engine_loop=False,
)
assert mock_register.called, (
"multiprocessing.reducer.register should have been called")
vllm_config_registered = False
for call_args in mock_register.call_args_list:
# Verify that a reducer for VllmConfig was registered
if len(call_args[0]) >= 2 and call_args[0][0] == VllmConfig:
vllm_config_registered = True
reducer_func = call_args[0][1]
assert callable(
reducer_func), "Reducer function should be callable"
break
assert vllm_config_registered, (
"VllmConfig should have been registered to multiprocessing.reducer"
)
async_llm.shutdown()

View File

@@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb):
    model_name = "openai/whisper-large-v3-turbo"
    server_args = ["--enforce-eager"]
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
-    prompt = "THE FIRST WORDS I SPOKE"
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
@@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb):
            temperature=0.0)
        out = json.loads(transcription)['text']
        assert "Mary had a little lamb," in out
-        # This should "force" whisper to continue prompt in all caps
-        transcription_wprompt = await client.audio.transcriptions.create(
-            model=model_name,
-            file=mary_had_lamb,
-            language="en",
-            response_format="text",
-            prompt=prompt,
-            temperature=0.0)
-        out_capital = json.loads(transcription_wprompt)['text']
-        assert prompt not in out_capital
@pytest.mark.asyncio
@@ -238,3 +227,31 @@ async def test_sampling_params(mary_had_lamb):
                                 extra_body=dict(seed=42))
    assert greedy_transcription.text != transcription.text
+@pytest.mark.asyncio
+async def test_audio_prompt(mary_had_lamb):
+    model_name = "openai/whisper-large-v3-turbo"
+    server_args = ["--enforce-eager"]
+    prompt = "This is a speech, recorded in a phonograph."
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        # Prompting should not cause parts of the original transcription
+        # to be omitted.
+        prefix = "The first words I spoke in the original phonograph"
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=model_name,
+            file=mary_had_lamb,
+            language="en",
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert prefix in out
+        transcription_wprompt = await client.audio.transcriptions.create(
+            model=model_name,
+            file=mary_had_lamb,
+            language="en",
+            response_format="text",
+            prompt=prompt,
+            temperature=0.0)
+        out_prompt = json.loads(transcription_wprompt)['text']
+        assert prefix in out_prompt

View File

@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
"url": image_url "url": image_url
} }
}, { }, {
"type": "image_url", "type": "image_pil",
"image_url": { "image_pil": ImageAsset('cherry_blossom').pil_image
"url": image_url
}
}, { }, {
"type": "text", "type": "text",
"text": "What's in these images?" "text": "What's in these images?"
@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
"url": image_url "url": image_url
} }
}, { }, {
"type": "image_url", "type": "image_pil",
"image_url": { "image_pil": ImageAsset('cherry_blossom').pil_image
"url": image_url
}
}, { }, {
"type": "text", "type": "text",
"text": "What's in these images?" "text": "What's in these images?"

View File

@@ -450,7 +450,8 @@ def test_multi_query_kv_attention(
            start += seq_len
        # xformers.AttentionBias to Tensor for use in reference impl.
        alibi_bias = [
-            b.materialize(b.shape, device=device).squeeze() for b in attn_bias
+            b.materialize((1, num_query_heads, i, i), device=device).squeeze()
+            for b, i in zip(attn_bias, seq_lens)
        ]
    else:
        attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)

View File

@@ -171,7 +171,7 @@ def test_env(
expected = "FLASHINFER_VLLM_V1" if use_v1 else name expected = "FLASHINFER_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected assert backend.get_name() == expected
else: else:
backend = get_attn_backend(16, backend = get_attn_backend(32,
torch.float16, torch.float16,
torch.float16, torch.float16,
block_size, block_size,
@@ -180,6 +180,45 @@ def test_env(
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected assert backend.get_name() == expected
if use_v1:
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("use_v1", [True, False])
def test_fp32_fallback(
device: str,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
if device == "cpu":
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
if use_v1 else "TORCH_SDPA")
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "FLEX_ATTENTION"
if use_v1 else "XFORMERS")
def test_flash_attn(monkeypatch: pytest.MonkeyPatch): def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation.""" """Test FlashAttn validation."""

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
DeepEP test utilities
"""
@@ -137,8 +138,7 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
                    low_latency_mode=low_latency_mode,
                    num_qps_per_rank=num_qps_per_rank)
    return DeepEPHTPrepareAndFinalize(buffer=buffer,
-                                      world_size=pgi.world_size,
-                                      rank=pgi.rank,
+                                      num_dispatchers=pgi.world_size,
                                      dp_size=dp_size,
                                      rank_expert_offset=pgi.rank *
                                      ht_args.num_local_experts)
@@ -146,7 +146,6 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
def make_deepep_ll_a2a(pg: ProcessGroup,
                       pgi: ProcessGroupInfo,
-                       dp_size: int,
                       deepep_ll_args: DeepEPLLArgs,
                       q_dtype: Optional[torch.dtype] = None,
                       block_shape: Optional[list[int]] = None):
@@ -166,8 +165,7 @@ def make_deepep_ll_a2a(pg: ProcessGroup,
    return DeepEPLLPrepareAndFinalize(
        buffer=buffer,
-        world_size=pgi.world_size,
-        dp_size=dp_size,
+        num_dispatchers=pgi.world_size,
        max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
        use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
    )
@@ -186,5 +184,4 @@ def make_deepep_a2a(pg: ProcessGroup,
                            block_shape)
    assert deepep_ll_args is not None
-    return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
-                              block_shape)
+    return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape)

View File

@@ -10,7 +10,7 @@ import triton.language as tl
from tests.kernels.moe.utils import (batched_moe,
                                     make_quantized_test_activations,
-                                     make_test_weights, triton_moe)
+                                     make_test_weights, naive_batched_moe)
from tests.kernels.quant_utils import native_batched_masked_quant_matmul
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
@@ -33,12 +33,10 @@ MNK_FACTORS = [
    (45, 512, 512),
    (45, 1024, 128),
    (45, 1024, 2048),
-    (64, 128, 128),
    (64, 512, 512),
    (64, 1024, 2048),
    (222, 128, 128),
    (222, 128, 2048),
-    (222, 512, 512),
    (222, 1024, 128),
    (222, 1024, 2048),
]
@@ -95,11 +93,12 @@ class BatchedMMTensors:
@pytest.mark.parametrize("max_tokens_per_expert", @pytest.mark.parametrize("max_tokens_per_expert",
[32, 64, 128, 192, 224, 256, 512]) [32, 64, 128, 192, 224, 256, 512])
@pytest.mark.parametrize("K", [128, 256, 1024]) @pytest.mark.parametrize("K", [128, 256, 1024])
@pytest.mark.parametrize("N", [128, 256, 512, 1024]) @pytest.mark.parametrize("N", [128, 256, 1024])
@pytest.mark.parametrize("dtype", @pytest.mark.parametrize(
[torch.float32, torch.float16, torch.bfloat16]) "dtype",
@pytest.mark.parametrize("block_shape", [None]) [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("per_act_token_quant", [False]) @pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
N: int, dtype: torch.dtype, N: int, dtype: torch.dtype,
block_shape: Optional[list[int]], block_shape: Optional[list[int]],
@@ -134,7 +133,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        in_dtype=act_dtype,
        quant_dtype=quant_dtype,
        block_shape=block_shape,
-        per_act_token_quant=per_act_token_quant)
+        per_act_token_quant=per_act_token_quant,
+    )
    B, B_q, B_scale, _, _, _ = make_test_weights(
        num_experts,
@@ -143,6 +143,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        in_dtype=act_dtype,
        quant_dtype=quant_dtype,
        block_shape=block_shape,
+        per_act_token_quant=per_act_token_quant,
    )
    out_shape = (num_experts, max_tokens_per_expert, N)
@@ -177,6 +178,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32 "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
}, },
per_act_token_quant=per_act_token_quant,
block_shape=block_shape, block_shape=block_shape,
) )
@@ -185,15 +187,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        B,
        ref_output,
        num_expert_tokens,
-        None,
-        None,
-        None,
    )
    q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
                                                      num_expert_tokens,
                                                      A_scale, B_scale,
-                                                      block_shape)
+                                                      block_shape,
+                                                      per_act_token_quant)
    rtol, atol = {
        torch.float16: (6e-2, 6e-2),
@@ -201,16 +201,17 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        torch.float32: (1e-2, 1e-2),
    }[test_output.dtype]
-    torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
    torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.parametrize("per_act_token_quant", [False])
-@pytest.mark.parametrize("block_shape", [None])
+@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
+@pytest.mark.parametrize("per_act_token_quant", [False, True])
+@pytest.mark.parametrize("block_shape", [None, [128, 128]])
+@pytest.mark.parametrize("input_scales", [False])
def test_fused_moe_batched_experts(
    m: int,
    n: int,
@@ -220,15 +221,19 @@ def test_fused_moe_batched_experts(
    dtype: torch.dtype,
    per_act_token_quant: bool,
    block_shape: Optional[list[int]],
+    input_scales: bool,
):
    current_platform.seed_everything(7)
    use_fp8_w8a8 = dtype == torch.float8_e4m3fn
+    if topk > e:
+        pytest.skip("topk > e")
    if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
        pytest.skip("Skip quantization test for non-quantized type")
-    if per_act_token_quant and block_shape is not None or topk > e:
+    if per_act_token_quant and block_shape is not None:
        pytest.skip("Skip illegal quantization test.")
    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
@@ -241,27 +246,26 @@ def test_fused_moe_batched_experts(
        act_dtype = dtype
        quant_dtype = None
-    _, w1, w1_s, _, w2, w2_s = make_test_weights(e,
-                                                 n,
-                                                 k,
-                                                 block_shape=block_shape,
-                                                 in_dtype=act_dtype,
-                                                 quant_dtype=quant_dtype)
+    w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights(
+        e,
+        n,
+        k,
+        block_shape=block_shape,
+        in_dtype=act_dtype,
+        quant_dtype=quant_dtype,
+        per_act_token_quant=per_act_token_quant,
+    )
+    if input_scales and quant_dtype is not None:
+        a1_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
+        a2_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
+    else:
+        a1_scale = None
+        a2_scale = None
    with set_current_vllm_config(vllm_config):
        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        batched_output = batched_moe(
-            a,
-            w1,
-            w2,
-            topk_weight,
-            topk_ids,
-            w1_scale=w1_s,
-            w2_scale=w2_s,
-            quant_dtype=quant_dtype,
-            per_act_token_quant=per_act_token_quant,
-            block_shape=block_shape,
-        )
        baseline_output = torch_experts(
            a,
            w1,
@@ -270,11 +274,14 @@ def test_fused_moe_batched_experts(
            topk_ids,
            w1_scale=w1_s,
            w2_scale=w2_s,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
            quant_dtype=quant_dtype,
            per_act_token_quant=per_act_token_quant,
-            block_shape=block_shape)
+            block_shape=block_shape,
+        )
-        triton_output = triton_moe(
+        batched_output = naive_batched_moe(
            a,
            w1,
            w2,
@@ -282,14 +289,31 @@ def test_fused_moe_batched_experts(
            topk_ids,
            w1_scale=w1_s,
            w2_scale=w2_s,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
            quant_dtype=quant_dtype,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
        )
+        triton_output = batched_moe(
+            a,
+            w1,
+            w2,
+            topk_weight,
+            topk_ids,
+            w1_scale=w1_s,
+            w2_scale=w2_s,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            quant_dtype=quant_dtype,
+            per_act_token_quant=per_act_token_quant,
+            block_shape=block_shape,
+        )
-    torch.testing.assert_close(triton_output,
+    torch.testing.assert_close(batched_output,
                               baseline_output,
-                               atol=2e-2,
+                               atol=3e-2,
                               rtol=2e-2)
    torch.testing.assert_close(triton_output,

View File

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# DeepGEMM Style Cutlass Grouped GEMM Test
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py
import random
import pytest
import torch
from tests.kernels.utils import baseline_scaled_mm
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
def cdiv(a, b):
return (a + b - 1) // b
def per_token_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
pad_size = (128 - (n % 128)) % 128
x = torch.nn.functional.pad(x,
(0, pad_size), value=0) if pad_size > 0 else x
x_view = x.view(m, -1, 128)
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
fp8_data = (x_view *
(448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn)
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
def per_block_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128),
device=x.device,
dtype=x.dtype)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn)
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
(4, 8192, 7168, 4096),
(4, 8192, 2048, 7168),
(8, 4096, 7168, 4096),
(8, 4096, 2048, 7168),
(32, 1024, 7168, 4096),
(32, 1024, 2048, 7168),
])
@pytest.mark.parametrize("out_dtype", [torch.float16])
@pytest.mark.skipif(
(lambda x: x is None or x.to_int() != 100)(
current_platform.get_device_capability()),
reason="Block Scaled Grouped GEMM is only supported on SM100.")
def test_cutlass_grouped_gemm(
num_groups: int,
expected_m_per_group: int,
k: int,
n: int,
out_dtype: torch.dtype,
):
device = "cuda"
alignment = 128
group_ms = [
int(expected_m_per_group * random.uniform(0.7, 1.3))
for _ in range(num_groups)
]
m = sum([cdiv(m, alignment) * alignment for m in group_ms])
x = torch.randn((m, k), device=device, dtype=out_dtype)
y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
out = torch.empty((m, n), device=device, dtype=out_dtype)
ref_out = torch.randn((m, n), device=device, dtype=out_dtype)
ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
pb_size = []
for i in range(num_groups):
pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
x_fp8 = per_token_cast_to_fp8(x)
y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn),
torch.empty((num_groups, cdiv(n, 128), k // 128),
device=device,
dtype=torch.float))
for i in range(num_groups):
y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i])
for i in range(num_groups):
a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]]
a_scale = x_fp8[1][ep_offset[i]:ep_offset[i + 1]]
b = y_fp8[0][i].t()
b_scale = y_fp8[1][i].t()
baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
ref_out[ep_offset[i]:ep_offset[i + 1]] = baseline
ops.cutlass_blockwise_scaled_grouped_mm(
out,
x_fp8[0],
y_fp8[0],
x_fp8[1],
y_fp8[1],
problem_sizes,
expert_offsets[:-1],
)
torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test DeepEP + DeepGEMM integration
DeepGEMM are gemm kernels specialized for the
@@ -148,8 +149,7 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
    fused_experts = BatchedDeepGemmExperts(
        max_num_tokens=max_tokens_per_rank,
-        world_size=pgi.world_size,
-        dp_size=dp_size,
+        num_dispatchers=pgi.world_size // dp_size,
        block_shape=test_config.block_size,
        per_act_token_quant=test_config.per_act_token_quant)
    mk = FusedMoEModularKernel(prepare_finalize=a2a,

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test deepep dispatch-combine logic
"""
@@ -154,12 +155,13 @@ def make_modular_kernel(
        deepep_ht_args = ht_args,
        deepep_ll_args = ll_args)
+    num_dispatchers = pgi.world_size // dp_size
    if low_latency_mode:
        assert not per_act_token_quant, "not supported in ll mode"
        fused_experts = BatchedTritonExperts(
            max_num_tokens=MAX_TOKENS_PER_RANK,
-            world_size=pgi.world_size,
-            dp_size=dp_size,
+            num_dispatchers=num_dispatchers,
            use_fp8_w8a8=is_quantized,
            use_int8_w8a8=False,
            use_int8_w8a16=False,

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit-test DeepGEMM FP8 kernels (no DeepEP).
Compare DeepGEMM path against the Triton fallback inside vLLM's fused_experts.

View File

@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
from vllm.platforms import current_platform
+from vllm.utils import cdiv
from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -112,18 +113,21 @@ def pplx_cutlass_moe(
    w2_scale = w2_scale.to(device)
    a1_scale = a1_scale.to(device)
+    assert num_experts % world_size == 0
+    num_local_experts = cdiv(num_experts, world_size)
+    num_dispatchers = pgi.world_size // dp_size
    prepare_finalize = PplxPrepareAndFinalize(
        ata,
-        max_num_tokens,
-        pgi.world_size,
-        rank,
-        dp_size,
-    )
+        max_num_tokens=max_num_tokens,
+        num_local_experts=num_local_experts,
+        num_dispatchers=num_dispatchers)
-    experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,
+    experts = CutlassExpertsFp8(num_local_experts,
                                out_dtype,
                                per_act_token,
                                per_out_ch,
+                                num_dispatchers=num_dispatchers,
                                use_batched_format=True)
    fused_experts = FusedMoEModularKernel(
@@ -181,35 +185,40 @@ def _pplx_moe(
    per_out_ch: bool,
    use_internode: bool,
):
-    if use_internode:
-        uid = nvshmem_get_unique_id(
-        ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
-        torch.distributed.broadcast(uid, src=0)
-        nvshmem_init(uid, pgi.rank, pgi.world_size)
-    else:
-        group_ranks = list(range(pgi.world_size))
-        cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-        group_name = cpu_group.group_name
-    with set_current_vllm_config(vllm_config):
-        torch_output = torch_experts(a_full, w1_full, w2_full, topk_weights,
-                                     topk_ids)
-    pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
-                                   w2_scale, topk_weights, topk_ids,
-                                   a1_scale, out_dtype, per_act_token,
-                                   per_out_ch, group_name)
-    torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                 pgi.world_size).to(pplx_output.device)
-    # Uncomment if more debugging is needed
-    # print("PPLX OUT:", pplx_output)
-    # print("TORCH OUT:", torch_output)
-    torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
-    if use_internode:
-        nvshmem_finalize()
+    try:
+        if use_internode:
+            uid = nvshmem_get_unique_id(
+            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            torch.distributed.broadcast(uid, src=0)
+            nvshmem_init(uid, pgi.rank, pgi.world_size)
+        else:
+            group_ranks = list(range(pgi.world_size))
+            cpu_group = torch.distributed.new_group(group_ranks,
+                                                    backend="gloo")
+            group_name = cpu_group.group_name
+        with set_current_vllm_config(vllm_config):
+            torch_output = torch_experts(a_full, w1_full, w2_full,
+                                         topk_weights, topk_ids)
+        pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
+                                       w2_scale, topk_weights, topk_ids,
+                                       a1_scale, out_dtype, per_act_token,
+                                       per_out_ch, group_name)
+        torch_output = chunk_by_rank(torch_output, pgi.rank,
+                                     pgi.world_size).to(pplx_output.device)
+        # Uncomment if more debugging is needed
+        # print("PPLX OUT:", pplx_output)
+        # print("TORCH OUT:", torch_output)
+        torch.testing.assert_close(pplx_output,
+                                   torch_output,
+                                   atol=0.05,
+                                   rtol=0)
+    finally:
+        if use_internode:
+            nvshmem_finalize()
@pytest.mark.parametrize("m", [2, 224])

View File

@@ -4,7 +4,10 @@
Run `pytest tests/kernels/test_pplx_moe.py`.
"""
-from typing import Optional
+import itertools
+import textwrap
+import traceback
+from typing import Callable, Optional
import pytest
import torch
@@ -19,12 +22,13 @@ except ImportError:
    has_pplx = False
from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
+from tests.kernels.quant_utils import dequant
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
+    BatchedTritonExperts)
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
@@ -38,22 +42,22 @@ requires_pplx = pytest.mark.skipif(
reason="Requires PPLX kernels", reason="Requires PPLX kernels",
) )
PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512), PPLX_COMBOS = [
(222, 2048, 1024)] # TODO: figure out why this fails, seems to be test problem
#(1, 128, 128),
PPLX_MOE_COMBOS = [
(1, 128, 128),
(2, 128, 512), (2, 128, 512),
(3, 1024, 2048), (3, 1024, 2048),
(32, 128, 1024), (4, 128, 128),
(32, 1024, 512),
(45, 512, 2048), (45, 512, 2048),
(64, 1024, 1024), (64, 1024, 512),
(222, 1024, 2048), (222, 2048, 1024),
(256, 1408, 2048),
] ]
NUM_EXPERTS = [8, 64] NUM_EXPERTS = [8, 64]
EP_SIZE = [1, 4]
TOP_KS = [1, 2, 6] TOP_KS = [1, 2, 6]
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
vllm_config = VllmConfig() vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128 vllm_config.scheduler_config.max_num_seqs = 128
@@ -169,9 +173,11 @@ def test_fused_moe_batched_experts(
    with set_current_vllm_config(vllm_config):
        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
+        baseline_output = torch_experts(a, w1, w2, topk_weight,
+                                        topk_ids)  # only for baseline
        torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
-        batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids)
+        batched_output = naive_batched_moe(
+            a, w1, w2, topk_weight, topk_ids)  # pick torch_experts or this
    torch.testing.assert_close(baseline_output,
                               torch_output,
@@ -183,6 +189,63 @@ def test_fused_moe_batched_experts(
rtol=0) rtol=0)
def create_pplx_prepare_finalize(
num_tokens: int,
hidden_dim: int,
topk: int,
num_experts: int,
rank: int,
dp_size: int,
world_size: int,
in_dtype: torch.dtype,
quant_dtype: Optional[torch.dtype],
block_shape: Optional[list[int]],
per_act_token_quant: bool,
group_name: Optional[str],
):
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
num_local_experts = rank_chunk(num_experts, 0, world_size)
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
max_num_tokens,
hidden_dim,
in_dtype,
quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
args = dict(
max_num_tokens=max_num_tokens,
num_experts=num_experts,
experts_per_token=topk,
rank=rank,
world_size=world_size,
dp_size=dp_size,
hidden_dim=hidden_dim,
hidden_dim_bytes=hidden_dim_bytes,
hidden_dim_scale_bytes=scale_bytes,
)
if group_name is None:
ata = AllToAll.internode(**args)
else:
args["group_name"] = group_name
ata = AllToAll.intranode(**args)
prepare_finalize = PplxPrepareAndFinalize(
ata,
max_num_tokens=max_num_tokens,
num_local_experts=num_local_experts,
num_dispatchers=world_size // dp_size,
)
return prepare_finalize, ata
def rank_chunk(num: int, r: int, w: int) -> int:
    rem = num % w
    return (num // w) + (1 if r < rem else 0)
@@ -193,6 +256,35 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
    return t[(r * chunk):(r + 1) * chunk]
def maybe_chunk_by_rank(t: Optional[torch.Tensor], r: int,
w: int) -> Optional[torch.Tensor]:
if t is not None:
return chunk_by_rank(t, r, w)
else:
return t
def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int,
w: int) -> Optional[torch.Tensor]:
if t is not None and t.numel() > 1:
chunk = rank_chunk(t.shape[0], r, w)
return t[(r * chunk):(r + 1) * chunk]
else:
return t
def chunk_scales(t: Optional[torch.Tensor], start: int,
end: int) -> Optional[torch.Tensor]:
if t is not None and t.numel() > 1:
return t[start:end]
else:
return t
def dummy_work(a: torch.Tensor) -> torch.Tensor:
return a * 1.1
def pplx_prepare_finalize(
    pgi: ProcessGroupInfo,
    dp_size: int,
@@ -200,11 +292,11 @@ def pplx_prepare_finalize(
    topk_weight: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
+    quant_dtype: Optional[torch.dtype],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
    group_name: Optional[str],
) -> torch.Tensor:
-    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize)
    assert torch.cuda.current_device() == pgi.local_rank
    topk = topk_ids.shape[1]
@@ -212,60 +304,66 @@ def pplx_prepare_finalize(
    device = pgi.device
    rank = pgi.rank
    world_size = pgi.world_size
    max_num_tokens = rank_chunk(num_tokens, 0, world_size)
-    args = dict(
-        max_num_tokens=max_num_tokens,
-        num_experts=num_experts,
-        experts_per_token=topk,
-        rank=rank,
-        world_size=world_size,
-        dp_size=dp_size,
-        hidden_dim=hidden_dim,
-        hidden_dim_bytes=hidden_dim * a.dtype.itemsize,
-        hidden_dim_scale_bytes=0,
-    )
-    if group_name is None:
-        ata = AllToAll.internode(**args)
-    else:
-        args["group_name"] = group_name
-        ata = AllToAll.intranode(**args)
    topk_ids = topk_ids.to(dtype=torch.uint32)
-    prepare_finalize = PplxPrepareAndFinalize(
-        ata,
-        max_num_tokens,
-        world_size,
+    prepare_finalize, ata = create_pplx_prepare_finalize(
+        num_tokens,
+        hidden_dim,
+        topk,
+        num_experts,
        rank,
        dp_size,
+        world_size,
+        a.dtype,
+        quant_dtype,
+        block_shape,
+        per_act_token_quant,
+        group_name,
    )
+    assert a.shape[0] == topk_ids.shape[0]
    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
+    assert a_chunk.shape[0] == chunk_topk_ids.shape[0]
-    out = torch.full(
-        a_chunk.shape,
-        torch.nan,
-        dtype=a.dtype,
-        device=device,
-    )
+    if (quant_dtype is not None and not per_act_token_quant
+            and block_shape is None):
+        a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+        a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    else:
+        a1_scale = None
+        a2_scale = None
    b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
        a_chunk,
-        None,
-        None,
+        a1_scale,
+        a2_scale,
        chunk_topk_weight,
        chunk_topk_ids,
        num_experts,
        None,
        False,
-        FusedMoEQuantConfig(),
+        FusedMoEQuantConfig(
+            quant_dtype,
+            per_act_token_quant,
+            False,
+            block_shape,
+        ),
    )
-    b_a = b_a * 1.5
+    b_a = dummy_work(
+        dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
+    out = torch.full(
+        (max_num_tokens, hidden_dim),
+        torch.nan,
+        dtype=a.dtype,
+        device=device,
+    )
    prepare_finalize.finalize(
        out,
@@ -291,70 +389,96 @@ def _pplx_prepare_finalize(
    score: torch.Tensor,
    topk: torch.Tensor,
    num_experts: int,
+    quant_dtype: Optional[torch.dtype],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
    use_internode: bool,
):
-    if use_internode:
-        uid = nvshmem_get_unique_id(
-        ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
-        torch.distributed.broadcast(uid, src=0)
-        nvshmem_init(uid, pgi.rank, pgi.world_size)
-        group_name = None
-    else:
-        group_ranks = list(range(pgi.world_size))
-        cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-        group_name = cpu_group.group_name
-    device = pgi.device
-    topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-    k = a.shape[1]
-    a_rep = torch.repeat_interleave(a, topk, dim=0).to(device)
-    torch_output = (a_rep.view(-1, topk, k) * 1.5 *
-                    topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to(
-                        a.dtype)
-    pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids,
-                                        num_experts, group_name)
-    torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                 pgi.world_size).to(pplx_output.device)
-    torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
-    if use_internode:
-        nvshmem_finalize()
+    try:
+        if use_internode:
+            uid = nvshmem_get_unique_id(
+            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            torch.distributed.broadcast(uid, src=0)
+            nvshmem_init(uid, pgi.rank, pgi.world_size)
+            group_name = None
+        else:
+            group_ranks = list(range(pgi.world_size))
+            cpu_group = torch.distributed.new_group(group_ranks,
+                                                    backend="gloo")
+            group_name = cpu_group.group_name
+        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
+        m, k = a.shape
+        a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
+        torch_output = (a_rep.view(m, topk, k) *
+                        topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(
+                            dim=1)
+        pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight,
+                                            topk_ids, num_experts, quant_dtype,
+                                            block_shape, per_act_token_quant,
+                                            group_name)
+        torch_output = chunk_by_rank(torch_output, pgi.rank,
+                                     pgi.world_size).to(pgi.device)
+        torch.testing.assert_close(pplx_output,
+                                   torch_output,
+                                   atol=3e-2,
+                                   rtol=3e-2)
+    finally:
+        if use_internode:
+            nvshmem_finalize()
-# TODO (bnell): this test point does not work for odd M due to how the test is
-# written, not due to limitations of the pplx kernels. The pplx_moe
-# test below is able to deal with odd M.
-# TODO (bnell) add fp8 tests
-@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS)
+@pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
+@pytest.mark.parametrize("per_act_token_quant", [False, True])
+@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False])
+@pytest.mark.optional
@requires_pplx
-def test_pplx_prepare_finalize(
+def test_pplx_prepare_finalize_slow(
    mnk: tuple[int, int, int],
    e: int,
    topk: int,
    dtype: torch.dtype,
    world_dp_size: tuple[int, int],
+    per_act_token_quant: bool,
+    block_shape: Optional[list[int]],
    use_internode: bool,
):
+    if dtype == torch.float8_e4m3fn:
+        use_fp8_w8a8 = True
+        act_dtype = torch.bfloat16
+        quant_dtype = dtype
+    else:
+        use_fp8_w8a8 = False
+        act_dtype = dtype
+        quant_dtype = None
+    if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
+        pytest.skip("Skip quantization test for non-quantized type")
+    if per_act_token_quant and block_shape is not None:
+        pytest.skip("Skip illegal quantization combination")
    current_platform.seed_everything(7)
    m, n, k = mnk
    world_size, dp_size = world_dp_size
    device = "cuda"
-    a = torch.randn((m, k), device=device, dtype=dtype) / 10
-    score = torch.randn((m, e), device=device, dtype=dtype)
+    a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
+    score = torch.randn((m, e), device=device, dtype=act_dtype)
    parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
-                    topk, e, use_internode)
+                    topk, e, quant_dtype, block_shape, per_act_token_quant,
+                    use_internode)
def pplx_moe(
@@ -369,84 +493,62 @@ def pplx_moe(
    topk_ids: torch.Tensor,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
-    qtype: Optional[torch.dtype] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    quant_dtype: Optional[torch.dtype] = None,
    per_act_token_quant=False,
    block_shape: Optional[list[int]] = None,
    use_compile: bool = False,
    use_cudagraphs: bool = True,
) -> torch.Tensor:
-    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
-    device = torch.device("cuda", rank)
-    hidden_dim = a.shape[1]
+    num_tokens, hidden_dim = a.shape
    num_experts = w1.shape[0]
    topk = topk_ids.shape[1]
-    max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64)
+    max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16)
-    hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
-        max_num_tokens,
+    prepare_finalize, ata = create_pplx_prepare_finalize(
+        num_tokens,
        hidden_dim,
+        topk,
+        num_experts,
+        rank,
+        dp_size,
+        world_size,
        a.dtype,
-        qtype,
-        per_act_token_quant=per_act_token_quant,
-        block_shape=block_shape,
+        quant_dtype,
+        block_shape,
+        per_act_token_quant,
+        group_name,
    )
-    args = dict(
-        max_num_tokens=max_num_tokens,
-        num_experts=num_experts,
-        experts_per_token=topk,
-        rank=rank,
-        world_size=world_size,
-        dp_size=dp_size,
-        hidden_dim=hidden_dim,
-        hidden_dim_bytes=hidden_dim_bytes,
-        hidden_dim_scale_bytes=scale_bytes,
-    )
-    if group_name is None:
-        ata = AllToAll.internode(**args)
-    else:
-        args["group_name"] = group_name
-        ata = AllToAll.intranode(**args)
    topk_ids = topk_ids.to(dtype=torch.uint32)
-    prepare_finalize = PplxPrepareAndFinalize(
-        ata,
-        max_num_tokens,
-        world_size,
-        rank,
-        dp_size,
-    )
-    experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
-                                   world_size=world_size,
-                                   dp_size=dp_size,
-                                   use_fp8_w8a8=qtype == torch.float8_e4m3fn,
-                                   block_shape=block_shape)
+    experts = BatchedTritonExperts(
+        max_num_tokens=max_num_tokens,
+        num_dispatchers=prepare_finalize.num_dispatchers(),
+        use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
+        block_shape=block_shape,
+        per_act_token_quant=per_act_token_quant,
+    )
    fused_experts = FusedMoEModularKernel(
        prepare_finalize,
        experts,
    )
    # Note: workers with the same dp_rank must use the exact same inputs.
-    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
-    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
-    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
+    a_chunk = chunk_by_rank(a, rank, world_size)
+    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size)
+    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size)
    # Chunking weights like this only works for batched format
-    w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
-    w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
-    if w1_scale is not None:
-        w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
-        w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
-    else:
-        w1_scale_chunk = None
-        w2_scale_chunk = None
+    w1_chunk = chunk_by_rank(w1, rank, world_size)
+    w2_chunk = chunk_by_rank(w2, rank, world_size)
+    w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size)
+    w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size)
+    a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size)
+    a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size)
    # Note: for now use_compile will error out if the problem size is
    # large enough to trigger chunking. I'm leaving the flag and
@@ -468,6 +570,8 @@ def pplx_moe(
            chunk_topk_ids,
            w1_scale=w1_scale_chunk,
            w2_scale=w2_scale_chunk,
+            a1_scale=a1_scale_chunk,
+            a2_scale=a2_scale_chunk,
            global_num_experts=num_experts)
if use_cudagraphs: if use_cudagraphs:
@@ -482,6 +586,8 @@ def pplx_moe(
            chunk_topk_ids,
            w1_scale=w1_scale_chunk,
            w2_scale=w2_scale_chunk,
+            a1_scale=a1_scale_chunk,
+            a2_scale=a2_scale_chunk,
            global_num_experts=num_experts)
        torch.cuda.synchronize()
@@ -494,48 +600,6 @@ def pplx_moe(
    return out
-def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
-    assert torch.cuda.current_device() == pgi.local_rank
-    num_experts = w1.shape[0]
-    device = pgi.device
-    rank = pgi.rank
-    world_size = pgi.world_size
-    max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
-    prepare_finalize = BatchedPrepareAndFinalize(
-        max_num_tokens=max_num_tokens,
-        world_size=world_size,
-        dp_size=dp_size,
-        rank=rank,
-    )
-    experts = NaiveBatchedExperts(max_num_tokens=a.shape[0],
-                                  world_size=1,
-                                  dp_size=1)
-    fused_experts = FusedMoEModularKernel(
-        prepare_finalize,
-        experts,
-    )
-    # Note: workers with the same dp_rank must use the exact same inputs.
-    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
-    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
-    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
-    out = fused_experts(
-        a_chunk,
-        # Chunking weights like this only works for batched format
-        chunk_by_rank(w1, rank, world_size).to(device),
-        chunk_by_rank(w2, rank, world_size).to(device),
-        chunk_topk_weight,
-        chunk_topk_ids,
-        global_num_experts=num_experts)
-    return out
def _pplx_moe(
    pgi: ProcessGroupInfo,
    dp_size: int,
@@ -544,75 +608,130 @@ def _pplx_moe(
    w2: torch.Tensor,
    score: torch.Tensor,
    topk: int,
+    num_experts: int,
    w1_s: Optional[torch.Tensor] = None,
    w2_s: Optional[torch.Tensor] = None,
-    qtype: Optional[torch.dtype] = None,
+    quant_dtype: Optional[torch.dtype] = None,
    per_act_token_quant: bool = False,
    block_shape: Optional[list[int]] = None,
    use_internode: bool = False,
):
-    if use_internode:
-        uid = nvshmem_get_unique_id(
-        ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
-        torch.distributed.broadcast(uid, src=0)
-        nvshmem_init(uid, pgi.rank, pgi.world_size)
-        group_name = None
-    else:
-        group_ranks = list(range(pgi.world_size))
-        cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
-        group_name = cpu_group.group_name
-    m, k = a.shape
-    e, _, n = w2.shape
-    moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
-    device = torch.device("cuda", pgi.rank)
-    a = a.to(device)
-    w1 = w1.to(device)
-    w2 = w2.to(device)
-    w1_s = w1_s.to(device) if w1_s is not None else None
-    w2_s = w2_s.to(device) if w2_s is not None else None
-    with set_current_vllm_config(vllm_config), override_config(moe_config):
-        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        torch_output = torch_experts(a,
-                                     w1,
-                                     w2,
-                                     topk_weight,
-                                     topk_ids,
-                                     w1_scale=w1_s,
-                                     w2_scale=w2_s,
-                                     quant_dtype=qtype,
-                                     per_act_token_quant=per_act_token_quant,
-                                     block_shape=block_shape)
-        pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
-                               a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
-                               qtype, per_act_token_quant, block_shape)
-        # TODO (bnell): fix + re-enable
-        #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
-        #                              topk_ids)
-    torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                 pgi.world_size).to(pplx_output.device)
-    torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
-    #torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0)
-    if use_internode:
-        nvshmem_finalize()
+    try:
+        if use_internode:
+            uid = nvshmem_get_unique_id(
+            ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
+            torch.distributed.broadcast(uid, src=0)
+            nvshmem_init(uid, pgi.rank, pgi.world_size)
+            group_name = None
+        else:
+            group_ranks = list(range(pgi.world_size))
+            cpu_group = torch.distributed.new_group(group_ranks,
+                                                    backend="gloo")
+            group_name = cpu_group.group_name
+        m, k = a.shape
+        e, _, n = w2.shape
+        moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
+        device = torch.device("cuda", pgi.rank)
+        rank = pgi.rank
+        world_size = pgi.world_size
+        a = a.to(device)
+        w1 = w1.to(device)
+        w2 = w2.to(device)
+        w1_s = w1_s.to(device) if w1_s is not None else None
+        w2_s = w2_s.to(device) if w2_s is not None else None
+        if (quant_dtype is not None and not per_act_token_quant
+                and block_shape is None):
+            a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+            a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+        else:
+            a1_scale = None
+            a2_scale = None
+        with set_current_vllm_config(vllm_config), override_config(moe_config):
+            topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
+            torch_output = torch_experts(
+                a,
+                w1,
+                w2,
+                topk_weight,
+                topk_ids,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                quant_dtype=quant_dtype,
+                per_act_token_quant=per_act_token_quant,
+                block_shape=block_shape,
+            )
+            batched_output = naive_batched_moe(
+                a,
+                w1,
+                w2,
+                topk_weight,
+                topk_ids,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                quant_dtype=quant_dtype,
+                per_act_token_quant=per_act_token_quant,
+                block_shape=block_shape,
+            )
+            pplx_output = pplx_moe(
+                group_name,
+                rank,
+                world_size,
+                dp_size,
+                a,
+                w1,
+                w2,
+                topk_weight,
+                topk_ids,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                quant_dtype=quant_dtype,
+                per_act_token_quant=per_act_token_quant,
+                block_shape=block_shape,
+            )
+        chunked_batch_output = chunk_by_rank(
+            batched_output, pgi.rank, pgi.world_size).to(pplx_output.device)
+        torch.testing.assert_close(batched_output,
+                                   torch_output,
+                                   atol=3e-2,
+                                   rtol=3e-2)
+        torch.testing.assert_close(pplx_output,
+                                   chunked_batch_output,
+                                   atol=3e-2,
+                                   rtol=3e-2)
+    finally:
+        if use_internode:
+            nvshmem_finalize()
@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS) @pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("per_act_token_quant", [False, True]) @pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False]) @pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional
@requires_pplx @requires_pplx
def test_pplx_moe( def test_pplx_moe_slow(
mnk: tuple[int, int, int], mnk: tuple[int, int, int],
e: int, e: int,
topk: int, topk: int,
@@ -633,18 +752,143 @@ def test_pplx_moe(
use_fp8_w8a8 = False use_fp8_w8a8 = False
quant_dtype = None quant_dtype = None
if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None: if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
pytest.skip("Skip quantization test for non-quantized type") pytest.skip("Skip quantization test for non-quantized type")
if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination")
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
_, w1, w1_s, _, w2, w2_s = make_test_weights(e, _, w1, w1_s, _, w2, w2_s = make_test_weights(
n, e,
k, n,
quant_dtype=quant_dtype, k,
block_shape=block_shape) quant_dtype=quant_dtype,
block_shape=block_shape,
per_act_token_quant=per_act_token_quant,
)
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e,
w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape, w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
use_internode) use_internode)
def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
make_weights: bool, test_fn: Callable):
def format_result(msg, ex=None):
if ex is not None:
x = str(ex)
newx = x.strip(" \n\t")[:16]
if len(newx) < len(x):
newx = newx + " ..."
prefix = "E\t"
print(f"{textwrap.indent(traceback.format_exc(), prefix)}")
print(f"FAILED {msg} - {newx}\n")
else:
print(f"PASSED {msg}")
current_platform.seed_everything(7)
combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES,
[False, True], [None, [128, 128]])
exceptions = []
count = 0
for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
count = count + 1
m, n, k = mnk
if dtype == torch.float8_e4m3fn:
use_fp8_w8a8 = True
quant_dtype = dtype
else:
use_fp8_w8a8 = False
quant_dtype = None
test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
f"dtype={dtype}, per_act_token={per_act_token_quant}, "
f"block_shape={block_shape}")
if not use_fp8_w8a8 and (per_act_token_quant
or block_shape is not None):
print(
f"{test_desc} - Skip quantization test for non-quantized type."
)
continue
if per_act_token_quant and block_shape is not None:
print(f"{test_desc} - Skip illegal quantization combination.")
continue
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
args = dict()
if make_weights:
_, w1, w1_s, _, w2, w2_s = make_test_weights(
e,
n,
k,
quant_dtype=quant_dtype,
block_shape=block_shape,
per_act_token_quant=per_act_token_quant,
)
args["w1"] = w1
args["w2"] = w2
args["w1_s"] = w1_s
args["w2_s"] = w2_s
try:
test_fn(
pgi=pgi,
dp_size=dp_size,
a=a,
score=score,
topk=topk,
num_experts=e,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
use_internode=use_internode,
**args,
)
format_result(test_desc)
except Exception as ex:
format_result(test_desc, ex)
exceptions.append(ex)
if len(exceptions) > 0:
raise RuntimeError(
f"{len(exceptions)} of {count} tests failed in child process, "
f"rank={pgi.rank}.")
else:
print(f"{count} of {count} tests passed in child process, "
f"rank={pgi.rank}.")
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False])
@requires_pplx
def test_pplx_prepare_finalize(
world_dp_size: tuple[int, int],
use_internode: bool,
):
current_platform.seed_everything(7)
world_size, dp_size = world_dp_size
parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
use_internode, False, _pplx_prepare_finalize)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False])
@requires_pplx
def test_pplx_moe(
world_dp_size: tuple[int, int],
use_internode: bool,
):
current_platform.seed_everything(7)
world_size, dp_size = world_dp_size
parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True,
_pplx_moe)

View File

@@ -63,13 +63,12 @@ def batched_moe(
fused_experts = FusedMoEModularKernel(
BatchedPrepareAndFinalize(max_num_tokens,
num_dispatchers=1,
num_local_experts=w1.shape[0],
rank=0),
BatchedTritonExperts(
max_num_tokens=max_num_tokens,
num_dispatchers=1,
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
@@ -105,13 +104,12 @@ def naive_batched_moe(
fused_experts = FusedMoEModularKernel(
BatchedPrepareAndFinalize(max_num_tokens,
num_dispatchers=1,
num_local_experts=w1.shape[0],
rank=0),
NaiveBatchedExperts(
max_num_tokens=max_num_tokens,
num_dispatchers=1,
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,

View File

@@ -277,6 +277,24 @@ def dequant(
return t.to(out_dtype)
def batched_dequant(
t: torch.Tensor,
scale: Optional[torch.Tensor],
block_shape: Optional[list[int]],
per_act_token_quant: bool,
out_dtype: Optional[torch.dtype] = torch.float32,
) -> torch.Tensor:
if scale is not None:
assert t.shape[0] == scale.shape[0]
out = torch.empty_like(t, dtype=out_dtype)
for e in range(t.shape[0]):
out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant,
out_dtype)
return out
return t.to(out_dtype)
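A small usage sketch for the batched_dequant helper added above. The shapes and the per-expert scale layout here are illustrative assumptions, and it presumes dequant applies the scale when block_shape is None and per-token quantization is off:

import torch

# Assume E=2 experts, each with a (4, 8) fp8 tensor and one scale per expert.
E, M, K = 2, 4, 8
t = torch.randn(E, M, K).to(torch.float8_e4m3fn)
scale = torch.ones(E, 1, 1)

# batched_dequant dequantizes each expert slice independently:
# out[e] = dequant(t[e], scale[e], ...)
out = batched_dequant(t, scale, block_shape=None,
                      per_act_token_quant=False,
                      out_dtype=torch.float32)
assert out.shape == (E, M, K) and out.dtype == torch.float32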
def native_batched_masked_quant_matmul(
A: torch.Tensor,
B: torch.Tensor,

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
@@ -74,3 +75,51 @@ def test_apply_repetition_penalties(
# Test the operator by applying the opcheck utility
opcheck(torch.ops._C.apply_repetition_penalties_,
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test is for checking the CUDA kernel")
@torch.inference_mode()
def test_apply_repetition_penalties_zero_seqs() -> None:
"""
Test the apply_repetition_penalties custom op with num_seqs=0
against a reference implementation.
"""
num_seqs = 0
vocab_size = 17
repetition_penalty = 1.05
dtype = torch.float32
seed = 0
current_platform.seed_everything(seed)
torch.set_default_device("cuda:0")
# Create test data
logits = torch.randn(num_seqs, vocab_size, dtype=dtype)
# Create masks with some random tokens marked as repeated
prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
# No tokens to mark as repeated since num_seqs=0
# Create repetition penalties tensor
repetition_penalties = torch.full((num_seqs, ),
repetition_penalty,
dtype=dtype)
# Run all three implementations
logits_torch = logits.clone()
logits_cuda = logits.clone()
apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
repetition_penalties)
apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
repetition_penalties)
# Compare all outputs to reference
torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
# Test the operator by applying the opcheck utility
opcheck(torch.ops._C.apply_repetition_penalties_,
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
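It is worth noting why the num_seqs=0 case is interesting at all: PyTorch constructs zero-row tensors without complaint, so the Python reference path trivially succeeds, and the regression being tested lives entirely in the CUDA kernel's handling of the resulting empty launch. A tiny standalone illustration:

import torch

logits = torch.randn(0, 17)  # zero sequences, vocab of 17
mask = torch.zeros(0, 17, dtype=torch.bool)
penalties = torch.full((0, ), 1.05)

# All of the shape math is well-defined on empty tensors...
assert logits[mask].numel() == 0 and penalties.numel() == 0
# ...so only the kernel launched over zero sequences has to cope,
# which is exactly what this test pins down.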

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""
import random

View File

@@ -1094,6 +1094,8 @@ def torch_experts(
if expert_map is not None:
topk_ids = expert_map[topk_ids]
f32 = torch.float32
for i in range(num_experts):
mask = topk_ids == i
if mask.sum():
@@ -1109,7 +1111,8 @@ def torch_experts(
out.dtype)
tmp2 = SiluAndMul()(tmp1)
tmp2, b_scale = moe_kernel_quantize_input(
tmp2, a2_scale, quant_dtype, per_act_token_quant,
block_shape)
out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
w2_scale[i], block_shape,
@@ -1117,7 +1120,6 @@ def torch_experts(
else:
assert (a_scale is not None and w1_scale is not None
and w2_scale is not None)
scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
tmp1 = a[mask].to(f32) * scales
w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)
@@ -1126,8 +1128,8 @@ def torch_experts(
w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
out[mask] = (tmp2 @ w2_dq).to(out.dtype)
return (out.view(M, -1, w2.shape[1]).to(f32) *
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
def torch_moe(a: torch.Tensor,
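A quick standalone illustration of why the final top-k mixing above is done in float32 and cast back only once at the end (the numbers are just an example):

import torch

w = torch.tensor([0.6, 0.4])          # top-k routing weights
outs = torch.tensor([1.001, 0.999])   # per-expert outputs for one token

# Mixing in bf16 rounds each product before the sum; mixing in fp32
# and casting once at the end keeps the intermediate precision.
mixed_bf16 = (outs.to(torch.bfloat16) * w.to(torch.bfloat16)).sum()
mixed_fp32 = (outs * w).sum().to(torch.bfloat16)
print(mixed_bf16, mixed_fp32)  # may differ in the last bits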

View File

@@ -249,23 +249,6 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
model_runner.model)
@pytest.fixture(params=[True, False])
def run_with_both_engines_lora(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v1 = request.node.get_closest_marker("skip_v1")
if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
monkeypatch.setenv('VLLM_USE_V1', '0')
yield
@pytest.fixture
def reset_default_device():
"""

View File

@@ -3,6 +3,7 @@
import pytest
from tests.models.registry import HF_EXAMPLE_MODELS
from tests.utils import multi_gpu_test
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
@@ -19,31 +20,55 @@ pytestmark = pytest.mark.hybrid_model
SSM_MODELS = [
"state-spaces/mamba-130m-hf",
"tiiuae/falcon-mamba-tiny-dev",
"mistralai/Mamba-Codestral-7B-v0.1",
]
HYBRID_MODELS = [
"ai21labs/Jamba-tiny-dev",
# NOTE: Running Plamo2 in the transformers implementation requires
# installing the causal-conv1d package, which is not listed as a test
# dependency since it's not compatible with pip-compile.
"pfnet/plamo-2-1b",
"Zyphra/Zamba2-1.2B-instruct",
"hmellor/tiny-random-BambaForCausalLM",
"ibm-ai-platform/Bamba-9B-v1",
"nvidia/Nemotron-H-8B-Base-8K",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
]
HF_UNSUPPORTED_MODELS = [
# The HF transformers implementation of Mamba2 is buggy for Codestral
# because it doesn't handle n_groups, so the test doesn't compare vLLM
# output with HF output.
# See https://github.com/huggingface/transformers/pull/35943
"mistralai/Mamba-Codestral-7B-v0.1",
# Note: vLLM V0 output does not match HF transformers for Nemotron-H-8B;
# currently we only compare vLLM V0 vs. vLLM V1.
"nvidia/Nemotron-H-8B-Base-8K",
# NOTE: Currently the test fails due to an HF transformers issue fixed in:
# https://github.com/huggingface/transformers/pull/39033
# We will enable the vLLM test for Granite after the next HF transformers
# release.
"ibm-granite/granite-4.0-tiny-preview",
]
V1_SUPPORTED_MODELS = [
"mistralai/Mamba-Codestral-7B-v0.1",
"ibm-ai-platform/Bamba-9B-v1",
"Zyphra/Zamba2-1.2B-instruct",
"nvidia/Nemotron-H-8B-Base-8K",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
]
ATTN_BLOCK_SIZES = {
"ibm-ai-platform/Bamba-9B-v1": 528,
"Zyphra/Zamba2-1.2B-instruct": 80,
"nvidia/Nemotron-H-8B-Base-8K": 528,
"ibm-granite/granite-4.0-tiny-preview": 400,
"tiiuae/Falcon-H1-0.5B-Base": 800,
}
# Avoid OOM
MAX_NUM_SEQS = 4
@@ -60,8 +85,16 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
with hf_runner(model) as hf_model:
if model not in HF_UNSUPPORTED_MODELS:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
else:
@@ -72,12 +105,21 @@ def test_models(
example_prompts, max_tokens, num_logprobs)
if model in V1_SUPPORTED_MODELS:
if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
block_size = ATTN_BLOCK_SIZES[model]
else:
block_size = 16
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if model in HYBRID_MODELS:
# required due to reorder_batch behaviour
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
enforce_eager=True,
enable_prefix_caching=False,
block_size=block_size) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
@@ -111,6 +153,14 @@ def test_batching(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional
import pytest
@@ -74,6 +75,13 @@ def test_models(
vllm_extra_kwargs["override_pooler_config"] = \ vllm_extra_kwargs["override_pooler_config"] = \
PoolerConfig(pooling_type="MEAN", normalize=False) PoolerConfig(pooling_type="MEAN", normalize=False)
max_model_len: Optional[int] = 512
if model in [
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2"
]:
max_model_len = None
# The example_prompts has ending "\n", for example: # The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n" # "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see: # sentence_transformers will strip the input texts, see:
@@ -87,7 +95,7 @@ def test_models(
with vllm_runner(model, with vllm_runner(model,
task="embed", task="embed",
max_model_len=512, max_model_len=max_model_len,
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts) vllm_outputs = vllm_model.embed(example_prompts)

View File

@@ -56,10 +56,16 @@ MODELS = [
enable_test=False),
]
V1FlashAttentionImpNotSupported = [
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
@@ -71,8 +77,10 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo, example_prompts,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import EmbedModelInfo

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import torch
from tests.conftest import HfRunner
from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
RERANK_MODELS = [
RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=True),
RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=False)
]
class MxbaiRerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@torch.no_grad()
def compute_logits(inputs):
logits = self.model(**inputs).logits[:, -1, :]
yes_logits = logits[:, self.yes_loc]
no_logits = logits[:, self.no_loc]
logits = yes_logits - no_logits
scores = logits.float().sigmoid()
return scores
scores = []
for prompt in prompts:
inputs = process_inputs([prompt])
score = compute_logits(inputs)
scores.append(score[0].item())
return torch.Tensor(scores)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "Qwen2ForSequenceClassification":
vllm_extra_kwargs["hf_overrides"] = {
"architectures": ["Qwen2ForSequenceClassification"],
"classifier_from_token": ["0", "1"],
"method": "from_2_way_softmax",
}
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)
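The HF reference runner above scores with sigmoid(yes_logit - no_logit), which lines up with the "from_2_way_softmax" conversion requested in vllm_extra_kwargs: softmax over two logits assigns the "yes" class exactly sigmoid(yes - no). A short standalone check of that identity:

import torch

no_logit, yes_logit = torch.tensor(0.3), torch.tensor(1.2)
# softmax([no, yes])[yes] == sigmoid(yes - no)
p_softmax = torch.softmax(torch.stack([no_logit, yes_logit]), dim=0)[1]
p_sigmoid = torch.sigmoid(yes_logit - no_logit)
assert torch.allclose(p_softmax, p_sigmoid)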

View File

@@ -33,9 +33,6 @@ if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
REQUIRES_V0_MODELS = [ REQUIRES_V0_MODELS = [
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl",
# V1 Test: not enough KV cache space in C1. # V1 Test: not enough KV cache space in C1.
"fuyu", "fuyu",
] ]
@@ -221,8 +218,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"blip2": VLMTestInfo( "blip2": VLMTestInfo(
# TODO: Change back to 2.7b once head_dim = 80 is supported models=["Salesforce/blip2-opt-2.7b"],
models=["Salesforce/blip2-opt-6.7b"],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:", prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
@@ -340,8 +336,7 @@ VLM_TEST_SETTINGS = {
"h2ovl": VLMTestInfo( "h2ovl": VLMTestInfo(
models = [ models = [
"h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-800m",
# TODO: Re-enable once head_dim = 80 is supported "h2oai/h2ovl-mississippi-2b",
# "h2oai/h2ovl-mississippi-2b",
], ],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501

View File

@@ -83,7 +83,7 @@ MODELS = [
QWEN2_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
STABLELM_CONFIG,
DOLPHIN_CONFIG,
# STARCODER_CONFIG, # broken
]

View File

@@ -169,7 +169,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct", "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base",
min_transformers_version="4.53"), min_transformers_version="4.53"),
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
@@ -240,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat", "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
trust_remote_code=True), trust_remote_code=True),
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True), "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct", "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
trust_remote_code=True, trust_remote_code=True,
v0_only=True), v0_only=True),
@@ -258,10 +259,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501 "Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501
v0_only=True), "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
v0_only=True),
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@@ -330,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501 "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501 "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501 extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
v0_only=True),
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501 extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
@@ -359,8 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True), trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501 extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True, trust_remote_code=True),
v0_only=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
max_model_len=10240), max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",

View File

@@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info.check_transformers_version(on_fail="skip") model_info.check_transformers_version(on_fail="skip")
# FIXME: Possible memory leak in the previous tests? # FIXME: Possible memory leak in the previous tests?
if model_arch == "GraniteSpeechForConditionalGeneration": if model_arch in ("GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
pytest.skip("Avoid OOM") pytest.skip("Avoid OOM")
# Avoid OOM and reduce initialization time by only using 1 layer # Avoid OOM and reduce initialization time by only using 1 layer

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright © 2025, Oracle and/or its affiliates.
"""Tests RTN quantization startup and generation,
doesn't test correctness

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
import json

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

View File

@@ -9,7 +9,7 @@ import torch
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -17,6 +17,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.structured_output.request import StructuredOutputRequest
EOS_TOKEN_ID = 50256
@@ -33,6 +34,7 @@ def create_scheduler(
block_size: int = 16,
max_model_len: Optional[int] = None,
num_speculative_tokens: Optional[int] = None,
skip_tokenizer_init: bool = False,
) -> Scheduler:
'''Create scheduler under test.
@@ -65,6 +67,7 @@ def create_scheduler(
trust_remote_code=True,
dtype="float16",
seed=42,
skip_tokenizer_init=skip_tokenizer_init,
)
# Cache config, optionally force APC
kwargs_cache = ({} if enable_prefix_caching is None else {
@@ -186,7 +189,7 @@ def test_get_num_unfinished_requests():
])
def test_schedule(enable_prefix_caching: Optional[bool],
prompt_logprobs: Optional[int]):
'''Test scheduling.
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
'''
scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching)
@@ -1408,7 +1411,7 @@ def create_requests_with_priority(
def test_priority_scheduling_basic_ordering():
"""Test that requests are scheduled in priority order
(lower value = higher priority)."""
scheduler = create_scheduler_with_priority()
@@ -1437,7 +1440,7 @@ def test_priority_scheduling_basic_ordering():
def test_priority_scheduling_arrival_time_tiebreaker():
"""Test that arrival time is used
as tiebreaker when priorities are equal."""
scheduler = create_scheduler_with_priority()
@@ -1495,7 +1498,7 @@ def test_priority_scheduling_mixed_priority_and_arrival():
def test_priority_scheduling_preemption():
"""Test that priority scheduling preempts
lower priority requests when memory is constrained."""
# Create scheduler with very limited memory to force preemption
scheduler = create_scheduler_with_priority(
@@ -1576,7 +1579,7 @@ def test_priority_scheduling_preemption():
def test_priority_scheduling_no_preemption_when_space_available():
"""Test that preemption doesn't happen
when there's space for new requests."""
scheduler = create_scheduler_with_priority(
max_num_seqs=3,  # Allow 3 concurrent requests
@@ -1626,7 +1629,7 @@ def test_priority_scheduling_no_preemption_when_space_available():
def test_priority_scheduling_preemption_victim_selection():
"""Test that the correct victim is selected for
preemption based on priority and arrival time."""
# This test verifies the priority-based victim selection logic
# by checking the waiting queue order after adding requests with different
@@ -1743,7 +1746,7 @@ def test_priority_scheduling_waiting_queue_order():
def test_priority_scheduling_fcfs_fallback():
"""Test that FCFS behavior is maintained when all
requests have same priority."""
scheduler = create_scheduler_with_priority()
@@ -1811,7 +1814,7 @@ def test_priority_scheduling_with_limited_slots():
def test_priority_scheduling_heap_property():
"""Test that the waiting queue maintains heap
property for priority scheduling."""
scheduler = create_scheduler_with_priority(
max_num_seqs=1,  # Only one request can run at a time
@@ -1857,3 +1860,39 @@ def test_priority_scheduling_heap_property():
# Verify requests were scheduled in priority order (lowest value first)
expected_priorities = sorted(priorities)
assert scheduled_priorities == expected_priorities
def test_schedule_skip_tokenizer_init():
scheduler = create_scheduler(skip_tokenizer_init=True)
requests = create_requests(num_requests=5)
for request in requests:
scheduler.add_request(request)
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == len(requests)
assert output.grammar_bitmask is None
def test_schedule_skip_tokenizer_init_structured_output_request():
scheduler = create_scheduler(skip_tokenizer_init=True)
guided_params = GuidedDecodingParams(regex="[0-9]+")
sampling_params = SamplingParams(
ignore_eos=False,
max_tokens=16,
guided_decoding=guided_params,
)
request = Request(
request_id="0",
prompt_token_ids=[0, 1],
multi_modal_inputs=None,
multi_modal_hashes=None,
multi_modal_placeholders=None,
sampling_params=sampling_params,
pooling_params=None,
eos_token_id=EOS_TOKEN_ID,
structured_output_request=StructuredOutputRequest(sampling_params),
)
scheduler.add_request(request)
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == 0
assert len(scheduler.running) == 0
assert len(scheduler.waiting) == 1

View File

@@ -1,19 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
from typing import TYPE_CHECKING, Optional
import pytest
from vllm import LLM
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
if TYPE_CHECKING:
from tests.conftest import VllmRunner
MODEL = "facebook/opt-125m"
DTYPE = "half"
def _vllm_model(
apc: bool,
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
*,
skip_tokenizer_init: bool = False,
):
"""Set up VllmRunner instance."""
monkeypatch.setenv("VLLM_USE_V1", "1")
return vllm_runner(
@@ -23,6 +34,7 @@ def _vllm_model(apc: bool, vllm_runner, monkeypatch):
enforce_eager=True,
enable_prefix_caching=apc,
gpu_memory_utilization=0.5,
skip_tokenizer_init=skip_tokenizer_init,
)
@@ -45,9 +57,27 @@ def vllm_model_apc(vllm_runner, monkeypatch):
yield vllm_model
@pytest.fixture(
# Function scope decouples tests & allows
# env var adjustment via monkeypatch
scope="function",
# Prefix caching
params=[False, True])
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
"""VllmRunner test fixture with tokenizer initialization skipped."""
with _vllm_model(
request.param,
vllm_runner,
monkeypatch,
skip_tokenizer_init=True,
) as vllm_model:
yield vllm_model
def _get_test_sampling_params(
prompt_list: list[str],
seed: Optional[int] = 42,
structured_outputs: bool = False,
) -> tuple[list[SamplingParams], list[int]]:
"""Generate random sampling params for a batch."""
@@ -62,14 +92,34 @@ def _get_test_sampling_params(
n_list = [get_mostly_n_gt1() for _ in range(len(prompt_list))]
# High temperature to maximize the chance of unique completions
return [
SamplingParams(
temperature=0.95,
top_p=0.95,
n=n,
seed=seed,
guided_decoding=GuidedDecodingParams(
regex="[0-9]+") if structured_outputs else None,
) for n in n_list
], n_list
def test_compatibility_with_skip_tokenizer_init(
vllm_model_skip_tokenizer_init: VllmRunner,
example_prompts: list[str],
):
# Case 1: Structured output request should raise an error.
sampling_params_list, _ = _get_test_sampling_params(
example_prompts,
structured_outputs=True,
)
model: LLM = vllm_model_skip_tokenizer_init.model
with pytest.raises(ValueError):
_ = model.generate(example_prompts, sampling_params_list)
def test_parallel_sampling(vllm_model, example_prompts) -> None:
"""Test passes if parallel sampling `n>1` yields `n` unique completions.
Args:
vllm_model: VllmRunner instance under test.
example_prompts: test fixture providing prompts for testing.

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from collections.abc import Callable

View File

@@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription "openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder "facebook/bart-large-cnn", # encoder decoder
"state-spaces/mamba-130m-hf", # mamba1 "state-spaces/mamba-130m-hf", # mamba1
"hmellor/tiny-random-BambaForCausalLM", # hybrid
"BAAI/bge-m3", # embedding "BAAI/bge-m3", # embedding
] ]

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.request import RequestStatus

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import tempfile

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
import numpy as np

View File

@@ -450,6 +450,7 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} must come before the current layer"
@@ -478,6 +479,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
invalid_layer = "model.layers.0.cross_attn.attn"
@@ -506,6 +508,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} cannot be the same as the current layer"
@@ -534,6 +537,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
def test_init_kv_cache_without_kv_sharing():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()
@@ -601,6 +605,7 @@ def test_init_kv_cache_without_kv_sharing():
def test_init_kv_cache_with_kv_sharing_valid():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys

View File

@@ -2,51 +2,146 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
from enum import Enum
class SPDXStatus(Enum):
"""SPDX header status enumeration"""
EMPTY = "empty"  # empty __init__.py
COMPLETE = "complete"
MISSING_LICENSE = "missing_license"  # Only has copyright line
MISSING_COPYRIGHT = "missing_copyright"  # Only has license line
MISSING_BOTH = "missing_both"  # Completely missing
FULL_SPDX_HEADER = (
"# SPDX-License-Identifier: Apache-2.0\n"
"# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
LICENSE_LINE = "# SPDX-License-Identifier: Apache-2.0"
COPYRIGHT_LINE = "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project"  # noqa: E501
def check_spdx_header_status(file_path):
"""Check the SPDX header status of the file"""
with open(file_path, encoding="UTF-8") as file:
lines = file.readlines()
if not lines:
# Empty file
return SPDXStatus.EMPTY
# Skip shebang line
start_idx = 0
if lines and lines[0].startswith("#!"):
start_idx = 1
has_license = False
has_copyright = False
# Check all lines for SPDX headers (not just the first two)
for i in range(start_idx, len(lines)):
line = lines[i].strip()
if line == LICENSE_LINE:
has_license = True
elif line == COPYRIGHT_LINE:
has_copyright = True
# Determine status based on what we found
if has_license and has_copyright:
return SPDXStatus.COMPLETE
elif has_license and not has_copyright:
# Only has license line
return SPDXStatus.MISSING_COPYRIGHT
elif not has_license and has_copyright:
# Only has copyright line
return SPDXStatus.MISSING_LICENSE
else:
# Completely missing both lines
return SPDXStatus.MISSING_BOTH
def add_header(file_path, status):
"""Add or supplement the SPDX header based on status"""
with open(file_path, "r+", encoding="UTF-8") as file:
lines = file.readlines()
file.seek(0, 0)
file.truncate()
if status == SPDXStatus.MISSING_BOTH:
# Completely missing, add the complete header
if lines and lines[0].startswith("#!"):
# Preserve shebang line
file.write(lines[0])
file.write(FULL_SPDX_HEADER + "\n")
file.writelines(lines[1:])
else:
# Add header directly
file.write(FULL_SPDX_HEADER + "\n")
file.writelines(lines)
elif status == SPDXStatus.MISSING_COPYRIGHT:
# Only has the license line; find it and add the copyright line after it
for i, line in enumerate(lines):
if line.strip() == LICENSE_LINE:
# Insert copyright line after license line
lines.insert(
i + 1,
f"{COPYRIGHT_LINE}\n",
)
break
file.writelines(lines)
elif status == SPDXStatus.MISSING_LICENSE:
# Only has the copyright line; find it and add the license line before it
for i, line in enumerate(lines):
if line.strip() == COPYRIGHT_LINE:
# Insert license line before copyright line
lines.insert(i, f"{LICENSE_LINE}\n")
break
file.writelines(lines)
def main():
"""Main function"""
files_missing_both = []
files_missing_copyright = []
files_missing_license = []
for file_path in sys.argv[1:]:
status = check_spdx_header_status(file_path)
if status == SPDXStatus.MISSING_BOTH:
files_missing_both.append(file_path)
elif status == SPDXStatus.MISSING_COPYRIGHT:
files_missing_copyright.append(file_path)
elif status == SPDXStatus.MISSING_LICENSE:
files_missing_license.append(file_path)
else:
continue
# Collect all files that need fixing
all_files_to_fix = (files_missing_both + files_missing_copyright +
files_missing_license)
if all_files_to_fix:
print("The following files are missing the SPDX header:")
if files_missing_both:
for file_path in files_missing_both:
print(f"  {file_path}")
add_header(file_path, SPDXStatus.MISSING_BOTH)
if files_missing_copyright:
for file_path in files_missing_copyright:
print(f"  {file_path}")
add_header(file_path, SPDXStatus.MISSING_COPYRIGHT)
if files_missing_license:
for file_path in files_missing_license:
print(f"  {file_path}")
add_header(file_path, SPDXStatus.MISSING_LICENSE)
sys.exit(1 if all_files_to_fix else 0)
if __name__ == "__main__":

View File

@@ -646,6 +646,20 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
def cutlass_blockwise_scaled_grouped_mm(
output: torch.Tensor,
a: torch.Tensor,
b: torch.Tensor,
scales_a: torch.Tensor,
scales_b: torch.Tensor,
problem_sizes: torch.Tensor,
expert_offsets: torch.Tensor,
):
torch.ops._C.cutlass_blockwise_scaled_grouped_mm(output, a, b, scales_a,
scales_b, problem_sizes,
expert_offsets)
def cutlass_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
block_scale_a: torch.Tensor,
block_scale_b: torch.Tensor, alpha: torch.Tensor,

View File

@@ -310,7 +310,8 @@ class MultiHeadAttention(nn.Module):
# currently, only torch_sdpa is supported on rocm
self.attn_backend = _Backend.TORCH_SDPA
else:
if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1,
_Backend.FLEX_ATTENTION):
backend = _Backend.XFORMERS
self.attn_backend = backend if backend in {

View File

@@ -4,7 +4,7 @@
import os
from contextlib import contextmanager
from functools import cache
from typing import Generator, Optional, Union
import torch
@@ -79,6 +79,33 @@ def get_global_forced_attn_backend() -> Optional[_Backend]:
return forced_attn_backend
def supports_head_size(
attn_backend: Union[str, type[AttentionBackend]],
head_size: int,
) -> bool:
if isinstance(attn_backend, str):
try:
attn_backend = resolve_obj_by_qualname(attn_backend)
except ImportError:
return False
assert isinstance(attn_backend, type)
# TODO: Update the interface once V0 is removed
if get_supported_head_sizes := getattr(attn_backend,
"get_supported_head_sizes", None):
return head_size in get_supported_head_sizes()
if validate_head_size := getattr(attn_backend, "validate_head_size", None):
try:
validate_head_size(head_size)
return True
except Exception:
return False
raise NotImplementedError(f"{attn_backend.__name__} does not support "
"head size validation")
def get_attn_backend(
head_size: int,
dtype: torch.dtype,
@@ -87,7 +114,7 @@ def get_attn_backend(
is_attention_free: bool,
is_blocksparse: bool = False,
use_mla: bool = False,
) -> type[AttentionBackend]:
"""Selects which attention backend to use and lazily imports it."""
# Accessing envs.* behind an @lru_cache decorator can cause the wrong
# value to be returned from the cache if the value changes between calls.
@@ -115,7 +142,7 @@ def _cached_get_attn_backend(
is_blocksparse: bool = False,
use_v1: bool = False,
use_mla: bool = False,
) -> type[AttentionBackend]:
if is_blocksparse:
logger.info("Using BlocksparseFlashAttention backend.")
from vllm.attention.backends.blocksparse_attn import (

View File

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch._inductor.pattern_matcher as pm

View File

@@ -466,6 +466,9 @@ class ModelConfig:
"affect the random state of the Python process that " "affect the random state of the Python process that "
"launched vLLM.", self.seed) "launched vLLM.", self.seed)
# Keep set served_model_name before maybe_model_redirect(self.model)
self.served_model_name = get_served_model_name(self.model,
self.served_model_name)
self.model = maybe_model_redirect(self.model) self.model = maybe_model_redirect(self.model)
# The tokenizer is consistent with the model by default. # The tokenizer is consistent with the model by default.
if self.tokenizer is None: if self.tokenizer is None:
@@ -609,8 +612,6 @@ class ModelConfig:
self.original_max_model_len = self.max_model_len
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
self.multimodal_config = self._init_multimodal_config()
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
@@ -1420,7 +1421,7 @@ class ModelConfig:
@property
def is_cross_encoder(self) -> bool:
return self.task == "classify"
@property
def use_mla(self) -> bool:
@@ -2318,7 +2319,7 @@ class SchedulerConfig:
if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
logger.warning(
"max_num_batched_tokens (%d) exceeds max_num_seqs "
"* max_model_len (%d). This may lead to unexpected behavior.",
self.max_num_batched_tokens,
self.max_num_seqs * self.max_model_len)
@@ -4762,6 +4763,12 @@ class VllmConfig:
if cls is not None: if cls is not None:
cls.verify_and_update_config(self) cls.verify_and_update_config(self)
if self.model_config.task == "classify":
# Maybe convert ForCausalLM into ForSequenceClassification model.
from vllm.model_executor.models.adapters import (
SequenceClassificationConfig)
SequenceClassificationConfig.verify_and_update_config(self)
def __str__(self): def __str__(self):
return ( return (
f"model={self.model_config.model!r}," f"model={self.model_config.model!r},"


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 '''
 Expert parallelism load balancer (EPLB).
 '''


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 """
 Expert parallelism load balancer (EPLB) metrics and states.


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 """
 Expert parallelism load balancer (EPLB) for vLLM.


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 """
 The actual execution of the rearrangement.


@@ -97,10 +97,10 @@ def get_kv_connector_cache_layout():
     # used for faster transfer.
     vllm_config = get_current_vllm_config()
     kv_config = vllm_config.kv_transfer_config
-    if vllm_config.model_config is None or kv_config is None:
+    if kv_config is not None and vllm_config.model_config is None:
         logger.warning_once("Unable to detect current VLLM config. " \
             "Defaulting to NHD kv cache layout.")
-    else:
+    elif kv_config is not None:
         use_mla = vllm_config.model_config.use_mla
         if not use_mla and kv_config.kv_connector == "NixlConnector":
             logger.info_once("NixlConnector detected. Setting KV cache " \
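
The reordered condition warns only when a KV connector is actually configured but the model config cannot be detected; with no connector at all, the function now stays quiet and keeps the default. A standalone rehearsal of the new control flow, with dict stand-ins for the real config objects (the HND layout for the Nixl path is an assumption based on the truncated log line):

# Simplified decision table for the patched get_kv_connector_cache_layout().
def choose_kv_cache_layout(kv_config, model_config):
    if kv_config is not None and model_config is None:
        # Connector configured but no model info: warn, keep the default.
        print("warning: unable to detect current config, defaulting to NHD")
    elif kv_config is not None:
        if not model_config["use_mla"] and kv_config["kv_connector"] == "NixlConnector":
            return "HND"  # assumed faster-transfer layout for NixlConnector
    return "NHD"

print(choose_kv_cache_layout(None, None))  # NHD, and no spurious warning now
print(choose_kv_cache_layout({"kv_connector": "NixlConnector"},
                             {"use_mla": False}))  # HND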


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
 import os


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import atexit
 import ctypes


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import OrderedDict
 from typing import Optional


@@ -1393,13 +1393,6 @@ class EngineArgs:
                                 recommend_to_remove=False)
             return False
 
-        # Only Fp16 and Bf16 dtypes since we only support FA.
-        V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
-        if model_config.dtype not in V1_SUPPORTED_DTYPES:
-            _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
-                               recommend_to_remove=False)
-            return False
-
         # No Mamba or Encoder-Decoder so far.
         if not model_config.is_v1_compatible:
             _raise_or_fallback(feature_name=model_config.architectures,
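
With the dtype gate removed, V1 no longer falls back for fp32 models; they are served through the FlexAttention backend per #19754. A usage sketch, model choice illustrative (note fp32 roughly doubles weight memory versus fp16):

# fp32 no longer trips the V1 engine's up-front dtype check.
from vllm import LLM

llm = LLM(model="facebook/opt-125m", dtype="float32")
print(llm.generate(["The capital of France is"])[0].outputs[0].text)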


@@ -28,7 +28,8 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
                                ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
-from pydantic import TypeAdapter
+from PIL import Image
+from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
 from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                           ProcessorMixin)
@@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
     """The type of the content part."""
 
 
+class PILImage(BaseModel):
+    """
+    A PIL.Image.Image object.
+    """
+    image_pil: Image.Image
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
+    """A simpler version of the param that only accepts a PIL image.
+
+    Example:
+    {
+        "image_pil": ImageAsset('cherry_blossom').pil_image
+    }
+    """
+    image_pil: Required[PILImage]
+
+
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain image_url.
     This is supported by OpenAI API, although it is not documented.
@@ -129,6 +149,7 @@ ChatCompletionContentPartParam: TypeAlias = Union[
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
+    CustomChatCompletionContentPILImageParam,
     CustomChatCompletionContentSimpleImageParam,
     ChatCompletionContentPartImageEmbedsParam,
     CustomChatCompletionContentSimpleAudioParam,
@@ -631,6 +652,10 @@ class BaseMultiModalContentParser(ABC):
                      image_embeds: Union[str, dict[str, str]]) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_audio(self, audio_url: str) -> None:
         raise NotImplementedError
@@ -677,6 +702,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         self._add_placeholder(placeholder)
 
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        placeholder = self._tracker.add("image", image_pil)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio = self._connector.fetch_audio(audio_url)
@@ -733,6 +762,13 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
             placeholder = self._tracker.add("image_embeds", future)
             self._add_placeholder(placeholder)
 
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        future: asyncio.Future[Image.Image] = asyncio.Future()
+        future.set_result(image_pil)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url)
@@ -851,12 +887,13 @@ _TextParser = partial(cast, ChatCompletionContentPartTextParam)
 _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
 _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
+_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
 # Need to validate url objects
 _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
 _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
-_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
+_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
 
 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: dict[
@@ -869,6 +906,7 @@ MM_PARSER_MAP: dict[
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
+    "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -938,7 +976,7 @@ def _parse_chat_message_content_mm_part(
     VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                           "image_embeds",
+                                           "image_embeds", "image_pil",
                                            "audio_url", "input_audio", "video_url")
@@ -1009,6 +1047,10 @@ def _parse_chat_message_content_part(
         else:
             return str_content
 
+    if part_type == "image_pil":
+        image_content = cast(Image.Image, content)
+        mm_parser.parse_image_pil(image_content)
+        return {'type': 'image'} if wrap_dicts else None
+
     if part_type == "image_url":
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
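
Taken together, these hooks let `llm.chat` accept an in-memory `PIL.Image.Image` via the new `image_pil` content part, with no URL or base64 round-trip (#19635). A hedged sketch; the model id and file path are illustrative:

# Passing a PIL image straight into llm.chat via the new "image_pil" part.
from PIL import Image
from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")  # any vision-chat model works
image = Image.open("cherry_blossom.jpg")     # assumed local file

conversation = [{
    "role": "user",
    "content": [
        # The raw Image object is handed over directly.
        {"type": "image_pil", "image_pil": image},
        {"type": "text", "text": "What is shown in this image?"},
    ],
}]
print(llm.chat(conversation)[0].outputs[0].text)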


@@ -1204,7 +1204,7 @@ class LLM:
         input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
 
-        pooling_params = PoolingParams()
+        pooling_params = PoolingParams(use_cross_encoder=True)
 
         tokenization_kwargs: dict[str, Any] = {}
         _validate_truncation_size(self.llm_engine.model_config.max_model_len,
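
With `use_cross_encoder=True`, `LLM.score` now pools through the cross-encoder activation fixed in #20527. A usage sketch, model id illustrative:

# Scoring a query/document pair with a cross-encoder reranker.
from vllm import LLM

llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", task="score")
(out, ) = llm.score("What is vLLM?",
                    "vLLM is a fast and easy-to-use library for LLM inference.")
print(out.outputs.score)  # relevance score from the classifier head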


@@ -910,6 +910,8 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
 }
 
 if envs.VLLM_SERVER_DEV_MODE:
+    logger.warning("SECURITY WARNING: Development endpoints are enabled! "
+                   "This should NOT be used in production!")
 
     @router.get("/server_info")
     async def show_server_info(raw_request: Request):


@@ -229,7 +229,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
     logit_bias: Optional[dict[str, float]] = None
     logprobs: Optional[bool] = False
     top_logprobs: Optional[int] = 0
-    # TODO(#9845): remove max_tokens when field is removed from OpenAI API
     max_tokens: Optional[int] = Field(
         default=None,
         deprecated=
@@ -433,23 +432,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
     }
 
     def to_beam_search_params(
-        self,
-        default_max_tokens: int,
-        default_sampling_params: Optional[dict] = None
-    ) -> BeamSearchParams:
-        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
-        max_tokens = self.max_completion_tokens or self.max_tokens
-
-        if default_sampling_params is None:
-            default_sampling_params = {}
+            self, max_tokens: int,
+            default_sampling_params: dict) -> BeamSearchParams:
         n = self.n if self.n is not None else 1
 
-        # Use minimum of context window, user request & server limit.
-        max_tokens = min(
-            val for val in (default_max_tokens, max_tokens,
-                            default_sampling_params.get("max_tokens", None))
-            if val is not None)
-
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get(
                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
@@ -465,21 +451,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
     def to_sampling_params(
         self,
-        default_max_tokens: int,
+        max_tokens: int,
         logits_processor_pattern: Optional[str],
-        default_sampling_params: Optional[dict] = None,
+        default_sampling_params: dict,
     ) -> SamplingParams:
-        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
-        max_tokens = self.max_completion_tokens or self.max_tokens
-
-        if default_sampling_params is None:
-            default_sampling_params = {}
-
-        # Use minimum of context window, user request & server limit.
-        max_tokens = min(
-            val for val in (default_max_tokens, max_tokens,
-                            default_sampling_params.get("max_tokens", None))
-            if val is not None)
 
         # Default parameters
         if (repetition_penalty := self.repetition_penalty) is None:
@@ -898,22 +873,15 @@ class CompletionRequest(OpenAIBaseModel):
     }
 
     def to_beam_search_params(
         self,
-        default_max_tokens: int,
-        default_sampling_params: Optional[dict] = None
+        max_tokens: int,
+        default_sampling_params: Optional[dict] = None,
     ) -> BeamSearchParams:
-        max_tokens = self.max_tokens
-
         if default_sampling_params is None:
             default_sampling_params = {}
         n = self.n if self.n is not None else 1
 
-        # Use minimum of context window, user request & server limit.
-        max_tokens = min(
-            val for val in (default_max_tokens, max_tokens,
-                            default_sampling_params.get("max_tokens", None))
-            if val is not None)
-
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get("temperature", 1.0)
@@ -928,21 +896,14 @@ class CompletionRequest(OpenAIBaseModel):
     def to_sampling_params(
         self,
-        default_max_tokens: int,
+        max_tokens: int,
         logits_processor_pattern: Optional[str],
         default_sampling_params: Optional[dict] = None,
     ) -> SamplingParams:
-        max_tokens = self.max_tokens
-
         if default_sampling_params is None:
             default_sampling_params = {}
 
-        # Use minimum of context window, user request & server limit.
-        max_tokens = min(
-            val for val in (default_max_tokens, max_tokens,
-                            default_sampling_params.get("max_tokens", None))
-            if val is not None)
-
         # Default parameters
         if (repetition_penalty := self.repetition_penalty) is None:
             repetition_penalty = default_sampling_params.get(
@@ -1195,8 +1156,9 @@ class ScoreRequest(OpenAIBaseModel):
     # --8<-- [end:score-extra-params]
 
-    def to_pooling_params(self):
-        return PoolingParams(additional_data=self.additional_data)
+    def to_pooling_params(self, *, use_cross_encoder: bool = False):
+        return PoolingParams(use_cross_encoder=use_cross_encoder,
+                             additional_data=self.additional_data)
 
 
 class RerankRequest(OpenAIBaseModel):
@@ -1221,8 +1183,9 @@ class RerankRequest(OpenAIBaseModel):
     # --8<-- [end:rerank-extra-params]
 
-    def to_pooling_params(self):
-        return PoolingParams(additional_data=self.additional_data)
+    def to_pooling_params(self, *, use_cross_encoder: bool = False):
+        return PoolingParams(use_cross_encoder=use_cross_encoder,
+                             additional_data=self.additional_data)
 
 
 class RerankDocument(BaseModel):
@@ -1813,7 +1776,7 @@ class TranscriptionRequest(OpenAIBaseModel):
         self,
         default_max_tokens: int,
         default_sampling_params: Optional[dict] = None) -> SamplingParams:
-        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
         max_tokens = default_max_tokens
 
         if default_sampling_params is None:
@@ -2029,7 +1992,7 @@ class TranslationRequest(OpenAIBaseModel):
         self,
         default_max_tokens: int,
         default_sampling_params: Optional[dict] = None) -> SamplingParams:
-        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
         max_tokens = default_max_tokens
 
         if default_sampling_params is None:
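
The net effect of the protocol changes: `to_sampling_params` no longer resolves the token budget itself; the serving layer clamps it first (see `get_max_tokens` further down) and must pass a concrete `default_sampling_params` dict. A hedged sketch of the new calling convention, values illustrative:

# New calling convention for the chat request -> SamplingParams path.
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

request = ChatCompletionRequest(
    model="dummy-model",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=512,  # deprecated OpenAI field, still honored as an input
)
params = request.to_sampling_params(
    256,   # max_tokens: already clamped by the caller, not recomputed here
    None,  # logits_processor_pattern
    {},    # default_sampling_params: now required to be a dict, never None
)
print(params.max_tokens)  # expected: 256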


@@ -34,6 +34,7 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
     MistralToolCall)
+from vllm.entrypoints.utils import get_max_tokens
 from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
@@ -233,15 +234,22 @@ class OpenAIServingChat(OpenAIServing):
         try:
             for i, engine_prompt in enumerate(engine_prompts):
                 sampling_params: Union[SamplingParams, BeamSearchParams]
-                default_max_tokens = self.max_model_len - len(
-                    engine_prompt["prompt_token_ids"])
+
+                if self.default_sampling_params is None:
+                    self.default_sampling_params = {}
+
+                max_tokens = get_max_tokens(
+                    max_model_len=self.max_model_len,
+                    request=request,
+                    input_length=len(engine_prompt["prompt_token_ids"]),
+                    default_sampling_params=self.default_sampling_params)
+
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
-                        default_max_tokens, self.default_sampling_params)
+                        max_tokens, self.default_sampling_params)
                 else:
                     sampling_params = request.to_sampling_params(
-                        default_max_tokens,
-                        self.model_config.logits_processor_pattern,
+                        max_tokens, self.model_config.logits_processor_pattern,
                         self.default_sampling_params)
 
                 self._log_inputs(request_id,


@@ -33,6 +33,7 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
                                                     is_text_tokens_prompt)
 # yapf: enable
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.utils import get_max_tokens
 from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
                               is_tokens_prompt)
 from vllm.logger import init_logger
@@ -160,15 +161,22 @@ class OpenAIServingCompletion(OpenAIServing):
                     input_length = len(engine_prompt["prompt_token_ids"])
                 else:
                     assert_never(engine_prompt)
-                default_max_tokens = self.max_model_len - input_length
+
+                if self.default_sampling_params is None:
+                    self.default_sampling_params = {}
+
+                max_tokens = get_max_tokens(
+                    max_model_len=self.max_model_len,
+                    request=request,
+                    input_length=input_length,
+                    default_sampling_params=self.default_sampling_params)
+
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
-                        default_max_tokens, self.default_sampling_params)
+                        max_tokens, self.default_sampling_params)
                 else:
                     sampling_params = request.to_sampling_params(
-                        default_max_tokens,
-                        self.model_config.logits_processor_pattern,
+                        max_tokens, self.model_config.logits_processor_pattern,
                         self.default_sampling_params)
 
                 request_id_item = f"{request_id}-{i}"


@@ -25,9 +25,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
-                                               PreTrainedTokenizer,
-                                               PreTrainedTokenizerFast)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import make_async, merge_async_iterators
 
 logger = init_logger(__name__)
@@ -50,7 +48,7 @@ class ServingScores(OpenAIServing):
     async def _embedding_score(
         self,
-        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+        tokenizer: AnyTokenizer,
         texts_1: list[str],
         texts_2: list[str],
         request: Union[RerankRequest, ScoreRequest],
@@ -141,7 +139,7 @@ class ServingScores(OpenAIServing):
     async def _cross_encoding_score(
         self,
-        tokenizer: Union[AnyTokenizer],
+        tokenizer: AnyTokenizer,
         texts_1: list[str],
         texts_2: list[str],
         request: Union[RerankRequest, ScoreRequest],
@@ -190,7 +188,7 @@ class ServingScores(OpenAIServing):
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
 
-        pooling_params = request.to_pooling_params()
+        pooling_params = request.to_pooling_params(use_cross_encoder=True)
 
         for i, engine_prompt in enumerate(engine_prompts):
             request_id_item = f"{request_id}-{i}"


@@ -6,6 +6,7 @@ from typing import Union
 
 import regex as re
 
+from vllm.entrypoints.chat_utils import random_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DeltaFunctionCall, DeltaMessage,
                                               DeltaToolCall,
@@ -15,7 +16,6 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser, ToolParserManager)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
@@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser):
                     DeltaToolCall(
                         index=self.current_tool_id,
                         type="function",
-                        id=f"chatcmpl-tool-{random_uuid()}",
+                        id=random_tool_call_id(),
                         function=DeltaFunctionCall(
                             name=function_name).model_dump(
                                 exclude_none=True),
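
Both tool parsers now import a shared `random_tool_call_id` helper from `chat_utils` instead of formatting the id inline. Its presumed body, inferred from the inline pattern removed here:

# Presumed shape of the shared helper (the removed inline code suggests it).
from vllm.utils import random_uuid

def random_tool_call_id() -> str:
    return f"chatcmpl-tool-{random_uuid()}"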


@@ -1,11 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 import json
 from collections.abc import Sequence
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Optional, Union
 
 import regex as re
 
+from vllm.entrypoints.chat_utils import random_tool_call_id
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DeltaFunctionCall, DeltaMessage,
                                               DeltaToolCall,
@@ -224,7 +226,7 @@ class xLAMToolParser(ToolParser):
                     function_name = name_match.group(1)
 
                     # The test expects us to send just the name first
-                    tool_id = f"chatcmpl-tool-{random_uuid()}"
+                    tool_id = random_tool_call_id()
                     delta = DeltaMessage(tool_calls=[
                         DeltaToolCall(
                             index=0,


@@ -5,13 +5,17 @@ import argparse
 import asyncio
 import functools
 import os
-from typing import Any, Optional
+import sys
+from typing import Any, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.background import BackgroundTask, BackgroundTasks
 
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest)
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
@@ -181,7 +185,6 @@ def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser,
                                               subcommand_name: list[str]):
-    import sys
 
     # Only handle --help=<keyword> for the current subcommand.
     # Since subparser_init() runs for all subcommands during CLI setup,
@@ -242,3 +245,18 @@ def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser,
             print(f"\nNo group or parameter matching '{search_keyword}'")
             print("Tip: use `--help=listgroup` to view all groups.")
             sys.exit(1)
+
+
+def get_max_tokens(max_model_len: int, request: Union[ChatCompletionRequest,
+                                                      CompletionRequest],
+                   input_length: int, default_sampling_params: dict) -> int:
+
+    max_tokens = getattr(request, "max_completion_tokens",
+                         None) or request.max_tokens
+    default_max_tokens = max_model_len - input_length
+    max_output_tokens = current_platform.get_max_output_tokens(input_length)
+
+    return min(val
+               for val in (default_max_tokens, max_tokens, max_output_tokens,
+                           default_sampling_params.get("max_tokens"))
+               if val is not None)
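
A quick rehearsal of the `min()` resolution above, with the platform cap and server default mocked out; the helper below is a hypothetical stand-in for illustration, not vLLM's actual function:

# Pure-Python rehearsal of get_max_tokens' resolution order (no vLLM import;
# platform_cap=None models a platform with no output-token limit).
def resolve_max_tokens(max_model_len, input_length, requested,
                       platform_cap=None, server_default=None):
    default_max_tokens = max_model_len - input_length  # context headroom
    return min(val
               for val in (default_max_tokens, requested, platform_cap,
                           server_default)
               if val is not None)

print(resolve_max_tokens(4096, 1000, 512))   # 512 (user request wins)
print(resolve_max_tokens(4096, 4000, 512))   # 96  (context headroom wins)
print(resolve_max_tokens(4096, 1000, None, server_default=256))  # 256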


@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
 
 import torch
@@ -184,15 +185,14 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def __init__(self,
                  max_num_tokens: int,
-                 world_size: int,
-                 dp_size: int,
+                 num_dispatchers: int,
                  block_shape: list[int],
                  per_act_token_quant=False):
         """
         max_num_tokens: Maximum number of tokens from a DP Rank
-        world_size: Number of EP ranks
-        dp_size: Number of data-parallel ranks
-        block_shape: Block quantization block shape
+        num_dispatchers: The number of DP dispatchers.
+        block_shape: Block quantization block shape.
+        per_act_token_quant: Per activation token quantization flag.
         """
         super().__init__(
             FusedMoEQuantConfig(
@@ -202,8 +202,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         ))
         assert self.block_shape == self.DEEPGEMM_BLOCK_SHAPE
         self.max_num_tokens = max_num_tokens
-        self.world_size = world_size
-        self.dp_size = dp_size
+        self.num_dispatchers = num_dispatchers
 
     @property
     def activation_formats(
@@ -233,7 +232,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         # FIXME (varun): We should be able to dispatch only from the leader
         # DP ranks in the case of TP > 1. At the moment, all the Ranks
         # end up sending their tokens. This needs to be fixed.
-        num_dispatchers = self.world_size
+        num_dispatchers = self.num_dispatchers
         num_experts = local_num_experts
         max_num_tokens = a.size(
             0) if self.max_num_tokens is None else self.max_num_tokens

Some files were not shown because too many files have changed in this diff.