Compare commits

..

60 Commits

Author SHA1 Message Date
Simon Mo
3fd2b0d21c Bump version to v0.6.1 (#8379)
2024-09-11 14:42:11 -07:00
Patrick von Platen
d394787e52 Pixtral (#8377)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-09-11 14:41:55 -07:00
Lily Liu
775f00f81e [Speculative Decoding] Test refactor (#8317)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-09-11 14:07:34 -07:00
Aarni Koskela
8baa454937 [Misc] Move device options to a single place (#8322) 2024-09-11 13:25:58 -07:00
bnellnm
73202dbe77 [Kernel][Misc] register ops to prevent graph breaks (#6917)
Co-authored-by: Sage Moore <sage@neuralmagic.com>
2024-09-11 12:52:19 -07:00
Cyrus Leung
7015417fd4 [Bugfix] Add missing attributes in mistral tokenizer (#8364) 2024-09-11 11:36:54 -07:00
Alexey Kondratiev(AMD)
aea02f30de [CI/Build] Excluding test_moe.py from AMD Kernels tests for investigation (#8373) 2024-09-11 18:31:41 +00:00
Li, Jiang
0b952af458 [Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-11 09:46:46 -07:00
Yang Fan
3b7fea770f [Model][VLM] Add Qwen2-VL model support (#7905)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-11 09:31:19 -07:00
Pooya Davoodi
cea95dfb94 [Frontend] Create ErrorResponse instead of raising exceptions in run_batch (#8347) 2024-09-11 05:30:11 +00:00
Yangshen⚡Deng
6a512a00df [model] Support for Llava-Next-Video model (#7559)
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-09-10 22:21:36 -07:00
Pavani Majety
efcf946a15 [Hardware][NV] Add support for ModelOpt static scaling checkpoints. (#6112) 2024-09-11 00:38:40 -04:00
Isotr0py
1230263e16 [Bugfix] Fix InternVL2 vision embeddings process with pipeline parallel (#8299) 2024-09-11 10:11:01 +08:00
Jee Jee Li
e497b8aeff [Misc] Skip loading extra bias for Qwen2-MOE GPTQ models (#8329) 2024-09-10 20:59:19 -04:00
Tyler Michael Smith
94144e726c [CI/Build][Kernel] Update CUTLASS to 3.5.1 tag (#8043) 2024-09-10 23:51:58 +00:00
William Lin
1d5e397aa4 [Core/Bugfix] pass VLLM_ATTENTION_BACKEND to ray workers (#8172) 2024-09-10 23:46:08 +00:00
Alexander Matveev
22f3a4bc6c [Bugfix] lookahead block table with cuda graph max capture (#8340)
[Bugfix] Ensure multistep lookahead allocation is compatible with cuda graph max capture (#8340)
2024-09-10 16:00:35 -07:00
Cody Yu
b1f3e18958 [MISC] Keep chunked prefill enabled by default with long context when prefix caching is enabled (#8342) 2024-09-10 22:28:28 +00:00
Prashant Gupta
04e7c4e771 [Misc] remove peft as dependency for prompt models (#8162) 2024-09-10 17:21:56 -04:00
Kevin Lin
5faedf1b62 [Spec Decode] Move ops.advance_step to flash attn advance_step (#8224) 2024-09-10 13:18:14 -07:00
sumitd2
02751a7a42 Fix ppc64le buildkite job (#8309) 2024-09-10 12:58:34 -07:00
Alexey Kondratiev(AMD)
f421f3cefb [CI/Build] Enabling kernels tests for AMD, ignoring some of then that fail (#8130) 2024-09-10 11:51:15 -07:00
Cyrus Leung
8c054b7a62 [Frontend] Clean up type annotations for mistral tokenizer (#8314) 2024-09-10 16:49:11 +00:00
Daniele
6234385f4a [CI/Build] enable ccache/scccache for HIP builds (#8327) 2024-09-10 08:55:08 -07:00
Cyrus Leung
da1a844e61 [Bugfix] Fix missing post_layernorm in CLIP (#8155) 2024-09-10 08:22:50 +00:00
Simon Mo
a1d874224d Add NVIDIA Meetup slides, announce AMD meetup, and add contact info (#8319) 2024-09-09 23:21:00 -07:00
Dipika Sikka
6cd5e5b07e [Misc] Fused MoE Marlin support for GPTQ (#8217) 2024-09-09 23:02:52 -04:00
Kyle Sayers
c7cb5c3335 [Misc] GPTQ Activation Ordering (#8135) 2024-09-09 16:27:26 -04:00
Vladislav Kruglikov
f9b4a2d415 [Bugfix] Correct adapter usage for cohere and jamba (#8292) 2024-09-09 11:20:46 -07:00
Adam Lugowski
58fcc8545a [Frontend] Add progress reporting to run_batch.py (#8060)
Co-authored-by: Adam Lugowski <adam.lugowski@parasail.io>
2024-09-09 11:16:37 -07:00
Kyle Mistele
08287ef675 [Bugfix] Streamed tool calls now more strictly follow OpenAI's format; ensures Vercel AI SDK compatibility (#8272) 2024-09-09 10:45:11 -04:00
Alexander Matveev
4ef41b8476 [Bugfix] Fix async postprocessor in case of preemption (#8267) 2024-09-07 21:01:51 -07:00
Joe Runde
cfe712bf1a [CI/Build] Use python 3.12 in cuda image (#8133)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-09-07 13:03:16 -07:00
sumitd2
b962ee1470 ppc64le: Dockerfile fixed, and a script for buildkite (#8026) 2024-09-07 11:18:40 -07:00
Isotr0py
36bf8150cc [Model][VLM] Decouple weight loading logic for Paligemma (#8269) 2024-09-07 17:45:44 +00:00
Isotr0py
e807125936 [Model][VLM] Support multi-images inputs for InternVL2 models (#8201) 2024-09-07 16:38:23 +08:00
Cyrus Leung
9f68e00d27 [Bugfix] Fix broken OpenAI tensorizer test (#8258) 2024-09-07 08:02:39 +00:00
youkaichao
ce2702a923 [tpu][misc] fix typo (#8260) 2024-09-06 22:40:46 -07:00
Wei-Sheng Chin
795b662cff Enable Random Prefix Caching in Serving Profiling Tool (benchmark_serving.py) (#8241) 2024-09-06 20:18:16 -07:00
Cyrus Leung
2f707fcb35 [Model] Multi-input support for LLaVA (#8238) 2024-09-07 02:57:24 +00:00
Kyle Mistele
41e95c5247 [Bugfix] Fix Hermes tool call chat template bug (#8256)
Co-authored-by: Kyle Mistele <kyle@constellate.ai>
2024-09-07 10:49:01 +08:00
William Lin
12dd715807 [misc] [doc] [frontend] LLM torch profiler support (#7943) 2024-09-06 17:48:48 -07:00
Patrick von Platen
29f49cd6e3 [Model] Allow loading from original Mistral format (#8168)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-09-06 17:02:05 -06:00
Dipika Sikka
23f322297f [Misc] Remove SqueezeLLM (#8220) 2024-09-06 16:29:03 -06:00
rasmith
9db52eab3d [Kernel] [Triton] Memory optimization for awq_gemm and awq_dequantize, 2x throughput (#8248) 2024-09-06 16:26:09 -06:00
Alexey Kondratiev(AMD)
1447c97e75 [CI/Build] Increasing timeout for multiproc worker tests (#8203) 2024-09-06 11:51:03 -07:00
Rui Qiao
de80783b69 [Misc] Use ray[adag] dependency instead of cuda (#7938) 2024-09-06 09:18:35 -07:00
afeldman-nm
e5cab71531 [Frontend] Add --logprobs argument to benchmark_serving.py (#8191) 2024-09-06 09:01:14 -07:00
Nick Hill
baa5467547 [BugFix] Fix Granite model configuration (#8216) 2024-09-06 11:39:29 +08:00
Jiaxin Shan
db3bf7c991 [Core] Support load and unload LoRA in api server (#6566)
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2024-09-05 18:10:33 -07:00
sroy745
2febcf2777 [Documentation][Spec Decode] Add documentation about lossless guarantees in Speculative Decoding in vLLM (#7962) 2024-09-05 16:25:29 -04:00
Michael Goin
2ee45281a5 Move verify_marlin_supported to GPTQMarlinLinearMethod (#8165) 2024-09-05 11:09:46 -04:00
Alex Brooks
9da25a88aa [MODEL] Qwen Multimodal Support (Qwen-VL / Qwen-VL-Chat) (#8029)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-05 12:48:10 +00:00
manikandan.tm@zucisystems.com
8685ba1a1e Inclusion of InternVLChatModel In PP_SUPPORTED_MODELS(Pipeline Parallelism) (#7860) 2024-09-05 11:33:37 +00:00
Cyrus Leung
288a938872 [Doc] Indicate more information about supported modalities (#8181) 2024-09-05 10:51:53 +00:00
Elfie Guo
e39ebf5cf5 [Core/Bugfix] Add query dtype as per FlashInfer API requirements. (#8173) 2024-09-05 05:12:26 +00:00
Kevin H. Luu
ba262c4e5a [ci] Mark LoRA test as soft-fail (#8160)
Signed-off-by: kevin <kevin@anyscale.com>
2024-09-04 20:33:12 -07:00
Woosuk Kwon
4624d98dbd [Misc] Clean up RoPE forward_native (#8076) 2024-09-04 20:31:48 -07:00
William Lin
1afc931987 [bugfix] >1.43 constraint for openai (#8169)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-09-04 17:35:36 -07:00
Maureen McElaney
e01c2beb7d [Doc] [Misc] Create CODE_OF_CONDUCT.md (#8161) 2024-09-04 16:50:13 -07:00
201 changed files with 10489 additions and 2915 deletions


@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
commands=$@ commands=$@
echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py"
fi
PARALLEL_JOB_COUNT=8 PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
#replace shard arguments #replace shard arguments
commands=${@//"--shard-id= "/"--shard-id=${GPU} "} commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
echo "Shard ${GPU} commands:$commands"
docker run \ docker run \
--device /dev/kfd --device /dev/dri \ --device /dev/kfd --device /dev/dri \
--network host \ --network host \


@@ -0,0 +1,33 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"


@@ -30,6 +30,12 @@ docker exec cpu-test bash -c "
 --ignore=tests/models/test_jamba.py \
 --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+# Run compressed-tensor test
+docker exec cpu-test bash -c "
+pytest -s -v \
+tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
 # online inference
 docker exec cpu-test bash -c "
 export VLLM_CPU_KVCACHE_SPACE=10


@@ -158,6 +158,7 @@ steps:
 - python3 offline_inference_with_prefix.py
 - python3 llm_engine_example.py
 - python3 offline_inference_vision_language.py
+- python3 offline_inference_vision_language_multi_image.py
 - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 - python3 offline_inference_encoder_decoder.py

@@ -216,7 +217,8 @@
 commands:
 # See https://github.com/vllm-project/vllm/issues/5152
 - export VLLM_ATTENTION_BACKEND=XFORMERS
-- pytest -v -s spec_decode
+- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
+- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

 - label: LoRA Test %N # 30min each
 mirror_hardwares: [amd]

@@ -227,6 +229,7 @@
 parallelism: 4

 - label: Kernels Test %N # 30min each
+mirror_hardwares: [amd]
 source_file_dependencies:
 - csrc/
 - vllm/attention

@@ -368,6 +371,7 @@
 - label: LoRA Long Context (Distributed) # 11min
 # This test runs llama 13B, so it is required to run on 4 GPUs.
 num_gpus: 4
+soft_fail: true
 source_file_dependencies:
 - vllm/lora
 - tests/lora/test_long_context

@@ -384,7 +388,18 @@
 - vllm/
 - tests/weight_loading
 commands:
-- bash weight_loading/run_model_weight_loading_test.sh
+- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+working_dir: "/vllm-workspace/tests"
+num_gpus: 2
+gpu: a100
+optional: true
+source_file_dependencies:
+- vllm/
+- tests/weight_loading
+commands:
+- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

 ##### multi gpus test #####


@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
 <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
 </ul>
+<h3>Adding or changing kernels</h3>
+<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
+<ul>
+<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
+<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
+<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.libary.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
+<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
+<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.
+</ul>
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
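For reference, the kernel-registration checklist added to the template above maps onto a small amount of Python when using the `torch.library` API. The following is a minimal, illustrative sketch only; the op name `mylib::scaled_add` and its implementation are hypothetical and not part of this diff:

```python
import torch

# Register a custom op with an explicit schema (hypothetical example op).
@torch.library.custom_op("mylib::scaled_add", mutates_args=())
def scaled_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    return x + alpha * y

# Ops returning Tensors need a meta/fake implementation so dynamic shapes
# can be handled without running the real kernel.
@scaled_add.register_fake
def _(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    return torch.empty_like(x)

# opcheck validates the schema, autograd, and fake-tensor registrations.
torch.library.opcheck(scaled_add, (torch.randn(4), torch.randn(4), 0.5))
```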


@@ -181,7 +181,6 @@ set(VLLM_EXT_SRC
"csrc/pos_encoding_kernels.cu" "csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu" "csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu" "csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu" "csrc/quantization/fp8/common.cu"
@@ -196,9 +195,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare( FetchContent_Declare(
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1 GIT_TAG v3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
) )
FetchContent_MakeAvailable(cutlass) FetchContent_MakeAvailable(cutlass)
@@ -232,6 +235,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"-gencode arch=compute_90a,code=sm_90a") "-gencode arch=compute_90a,code=sm_90a")
endif() endif()
# #
# Machete kernels # Machete kernels
@@ -290,6 +294,12 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
# #
# _moe_C extension # _moe_C extension
# #

CODE_OF_CONDUCT.md (new file, 128 lines)

@@ -0,0 +1,128 @@
# vLLM Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official email address,
posting via an official social media account, or acting as an appointed
representative at an online or offline/IRL event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
version 2.1, available at
[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
For answers to common questions about this code of conduct, see the
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).


@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies

@@ -37,7 +37,6 @@ WORKDIR /workspace
 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
 python3 -m pip install -r requirements-cuda.txt

@@ -66,7 +65,6 @@ COPY setup.py setup.py
 COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm vllm

@@ -135,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # image with vLLM installed
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive

@@ -147,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
 && apt-get update -y \
 && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
 && add-apt-repository ppa:deadsnakes/ppa \
 && apt-get update -y \
 && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \

@@ -181,6 +180,10 @@ FROM vllm-base AS test
 ADD . /vllm-workspace/
 # install development dependencies (for testing)
+# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
+# This installation must complete before the test dependencies are collected and installed.
+RUN --mount=type=cache,target=/root/.cache/pip \
+python3 -m pip install "setuptools>=74.1.1"
 RUN --mount=type=cache,target=/root/.cache/pip \
 python3 -m pip install -r requirements-dev.txt


@@ -2,9 +2,14 @@
 FROM ubuntu:22.04 AS cpu-test-1

+ENV CCACHE_DIR=/root/.cache/ccache

+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

 RUN --mount=type=cache,target=/var/cache/apt \
 apt-get update -y \
 && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html

@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 pip install --upgrade pip && \
 pip install -r requirements-build.txt

+# install oneDNN
+RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
+RUN --mount=type=cache,target=/root/.cache/ccache \
+cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
+-DONEDNN_BUILD_DOC=OFF \
+-DONEDNN_BUILD_EXAMPLES=OFF \
+-DONEDNN_BUILD_TESTS=OFF \
+-DONEDNN_BUILD_GRAPH=OFF \
+-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
+-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
+cmake --build ./oneDNN/build --target install --config Release

 FROM cpu-test-1 AS build

 WORKDIR /workspace/vllm

@@ -40,7 +58,6 @@ COPY ./ ./
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/pip \
 --mount=type=cache,target=/root/.cache/ccache \
 VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \


@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE" RUN echo "Base image is $BASE_IMAGE"
# Install some basic utilities # Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update \
&& apt-get install python3 python3-pip -y \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
### Mount Point ### ### Mount Point ###
# When launching the container, mount the code directory to /app # When launching the container, mount the code directory to /app


@@ -4,7 +4,8 @@
 FROM ubuntu:22.04 AS dev

 RUN apt-get update -y && \
-apt-get install -y python3-pip git
+apt-get install -y python3-pip git && \
+apt-get install -y ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace

 # copy requirements


@@ -2,21 +2,26 @@ FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root

-RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1

 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

 # These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing

 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

-WORKDIR /vllm-workspace
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
+WORKDIR /workspace/
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
 FROM $BASE_IMAGE
 WORKDIR /workspace

+# Install some basic utilities
+RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1

 # Install the TPU and Pallas dependencies.
 RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
 RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html


@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 chmod 644 /usr/share/keyrings/intel-graphics.gpg

 RUN apt-get update -y \
-&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm


@@ -1,5 +1,4 @@
 include LICENSE
-include requirements-adag.txt
 include requirements-common.txt
 include requirements-cuda.txt
 include requirements-rocm.txt


@@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone
 ---

-**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
-We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
-Join us to hear the vLLM's recent update about performance.
-Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
+**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
+We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
+Join us to learn more about recent advancements of vLLM on MI300X.
+Register [here](https://lu.ma/db5ld9n5) and be a part of the event!

 ---

 *Latest News* 🔥
+- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
 - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
 - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
 - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).

@@ -130,3 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 year={2023}
 }
 ```
+
+## Contact Us
+
+* For technical questions and feature requests, please use Github issues or discussions.
+* For discussing with fellow users, please use Discord.
+* For security disclosures, please use Github's security advisory feature.
+* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.


@@ -24,6 +24,7 @@ class RequestFuncInput:
 model: str
 best_of: int = 1
 use_beam_search: bool = False
+logprobs: Optional[int] = None

 @dataclass

@@ -236,6 +237,7 @@ async def async_request_openai_completions(
 "temperature": 0.0,
 "best_of": request_func_input.best_of,
 "max_tokens": request_func_input.output_len,
+"logprobs": request_func_input.logprobs,
 "stream": True,
 }
 headers = {


@@ -10,7 +10,7 @@ import torch
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
 from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser

@@ -205,13 +205,11 @@ if __name__ == '__main__':
 default=None,
 help=('path to save the pytorch profiler output. Can be visualized '
 'with ui.perfetto.dev or Tensorboard.'))
-parser.add_argument(
-"--device",
-type=str,
-default="auto",
-choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-help='device type for vLLM execution, supporting CUDA, OpenVINO and '
-'CPU.')
+parser.add_argument("--device",
+type=str,
+default="auto",
+choices=DEVICE_OPTIONS,
+help='device type for vLLM execution')
 parser.add_argument('--block-size',
 type=int,
 default=16,


@@ -195,8 +195,16 @@ def sample_sonnet_requests(
 def sample_random_requests(
-input_len: int, output_len: int, num_prompts: int, range_ratio: float,
-tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+prefix_len: int,
+input_len: int,
+output_len: int,
+num_prompts: int,
+range_ratio: float,
+tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+prefix_token_ids = np.random.randint(0,
+tokenizer.vocab_size,
+size=prefix_len).tolist()

 input_lens = np.random.randint(
 int(input_len * range_ratio),

@@ -211,10 +219,12 @@ def sample_random_requests(
 offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
 input_requests = []
 for i in range(num_prompts):
-prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+prompt = tokenizer.decode(prefix_token_ids +
+[(offsets[i] + i + j) % tokenizer.vocab_size
 for j in range(input_lens[i])])
 input_requests.append(
-(prompt, int(input_lens[i]), int(output_lens[i])))
+(prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))

 return input_requests

@@ -318,6 +328,7 @@ async def benchmark(
 model_id: str,
 tokenizer: PreTrainedTokenizerBase,
 input_requests: List[Tuple[str, int, int]],
+logprobs: Optional[int],
 best_of: int,
 use_beam_search: bool,
 request_rate: float,

@@ -339,6 +350,7 @@ async def benchmark(
 api_url=api_url,
 prompt_len=test_prompt_len,
 output_len=test_output_len,
+logprobs=logprobs,
 best_of=best_of,
 use_beam_search=use_beam_search,
 )

@@ -358,6 +370,7 @@
 api_url=base_url + "/start_profile",
 prompt_len=test_prompt_len,
 output_len=test_output_len,
+logprobs=logprobs,
 best_of=best_of,
 use_beam_search=use_beam_search,
 )

@@ -379,6 +392,7 @@
 api_url=api_url,
 prompt_len=prompt_len,
 output_len=output_len,
+logprobs=logprobs,
 best_of=best_of,
 use_beam_search=use_beam_search,
 )

@@ -396,6 +410,7 @@
 api_url=base_url + "/stop_profile",
 prompt_len=test_prompt_len,
 output_len=test_output_len,
+logprobs=logprobs,
 best_of=best_of,
 use_beam_search=use_beam_search,
 )

@@ -562,6 +577,7 @@ def main(args: argparse.Namespace):
 elif args.dataset_name == "random":
 input_requests = sample_random_requests(
+prefix_len=args.random_prefix_len,
 input_len=args.random_input_len,
 output_len=args.random_output_len,
 num_prompts=args.num_prompts,

@@ -580,6 +596,7 @@
 model_id=model_id,
 tokenizer=tokenizer,
 input_requests=input_requests,
+logprobs=args.logprobs,
 best_of=args.best_of,
 use_beam_search=args.use_beam_search,
 request_rate=args.request_rate,

@@ -721,6 +738,16 @@
 help=
 "Number of output tokens per request, used only for sonnet dataset.",
 )
+parser.add_argument(
+"--logprobs",
+type=int,
+default=None,
+help=("Number of logprobs-per-token to compute & return as part of "
+"the request. If unspecified, then either (1) if beam search "
+"is disabled, no logprobs are computed & a single dummy "
+"logprob is returned for each token; or (2) if beam search "
+"is enabled 1 logprob per token is computed"),
+)
 parser.add_argument(
 "--sonnet-prefix-len",
 type=int,

@@ -749,6 +776,14 @@
 help="Range of sampled ratio of input/output length, "
 "used only for random sampling.",
 )
+parser.add_argument(
+"--random-prefix-len",
+type=int,
+default=0,
+help="Number of fixed prefix tokens before random "
+" context. The length range of context in a random "
+" request is [random-prefix-len, "
+" random-prefix-len + random-prefix-len * random-range-ratio).")
 parser.add_argument(
 "--request-rate",
 type=float,


@@ -11,7 +11,7 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
 PreTrainedTokenizerBase)

-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
 build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

@@ -451,13 +451,11 @@
 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
 'instead supported for common inference criteria.')
-parser.add_argument(
-"--device",
-type=str,
-default="auto",
-choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-help='device type for vLLM execution, supporting CUDA, OpenVINO and '
-'CPU.')
+parser.add_argument("--device",
+type=str,
+default="auto",
+choices=DEVICE_OPTIONS,
+help='device type for vLLM execution')
 parser.add_argument(
 "--num-scheduler-steps",
 type=int,


@@ -1,4 +1,5 @@
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_CXX_STANDARD 17)

 #
 # Define environment variables for special configurations

@@ -83,12 +84,7 @@ endif()
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

-list(APPEND LIBS "numa")
+list(APPEND LIBS dnnl numa)

-#
-# Define extension targets
-#

 #
 # _C extension

@@ -102,6 +98,16 @@ set(VLLM_EXT_SRC
 "csrc/cpu/pos_encoding.cpp"
 "csrc/cpu/torch_bindings.cpp")

+if (AVX512_FOUND AND NOT AVX512_DISABLED)
+set(VLLM_EXT_SRC
+"csrc/cpu/quant.cpp"
+${VLLM_EXT_SRC})
+endif()
+
+#
+# Define extension targets
+#

 define_gpu_extension_target(
 _C
 DESTINATION vllm


@@ -350,6 +350,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
 target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
 ${GPU_INCLUDE_DIRECTORIES})

+# TODO: is torch_python_LIBRARY needed?
 target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
 ${GPU_LIBRARIES})


@@ -24,8 +24,8 @@ namespace vec_op {
 #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
 #define CPU_KERNEL_GUARD_IN(NAME) \
-std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
+#define CPU_KERNEL_GUARD_OUT(NAME)
 #endif

 #define FORCE_INLINE __attribute__((always_inline)) inline

@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
 explicit BF16Vec16(const FP32Vec16 &);

 void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+
+void save(void* ptr, const int elem_num) const {
+constexpr uint32_t M = 0xFFFFFFFF;
+__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+_mm256_mask_storeu_epi16(ptr, mask, reg);
+}
 };

 #ifdef __AVX512F__

@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 return FP32Vec16(_mm512_div_ps(reg, b.reg));
 }

+FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
+return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg)));
+}
+
+FP32Vec16 max(const FP32Vec16& b) const {
+return FP32Vec16(_mm512_max_ps(reg, b.reg));
+}
+
+FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
+constexpr uint32_t M = 0xFFFFFFFF;
+__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
+}
+
+FP32Vec16 abs() const {
+return FP32Vec16(_mm512_abs_ps(reg));
+}
+
 float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
+
+float reduce_max() const { return _mm512_reduce_max_ps(reg); }

 template <int group_size> float reduce_sub_sum(int idx) {
 static_assert(VEC_ELEM_NUM % group_size == 0);
 constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));

@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 }

 void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
+
+void save(float* ptr, const int elem_num) const {
+constexpr uint32_t M = 0xFFFFFFFF;
+__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+_mm512_mask_storeu_ps(ptr, mask, reg);
+}
 };
 #else
 struct FP32Vec16 : public Vec<FP32Vec16> {

@@ -433,6 +465,32 @@
 };
 #endif

+#ifdef __AVX512F__
+struct INT8Vec16: public Vec<INT8Vec16> {
+constexpr static int VEC_ELEM_NUM = 16;
+union AliasReg {
+__m128i reg;
+int8_t values[VEC_ELEM_NUM];
+};
+
+__m128i reg;
+
+explicit INT8Vec16(const FP32Vec16& vec) : reg(
+_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
+) {}
+
+void save(int8_t* ptr) const {
+_mm_storeu_epi8(ptr, reg);
+}
+
+void save(int8_t* ptr, const int elem_num) const {
+constexpr uint32_t M = 0xFFFFFFFF;
+__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+_mm_mask_storeu_epi8(ptr, mask, reg);
+}
+};
+#endif
+
 template <typename T> struct VecType { using vec_type = void; };

 template <typename T> using vec_t = typename VecType<T>::vec_type;

csrc/cpu/dnnl_helper.hpp (new file, 168 lines)

@@ -0,0 +1,168 @@
#ifndef DNNL_HELPER_HPP
#define DNNL_HELPER_HPP
#include <c10/util/BFloat16.h>
#include "oneapi/dnnl/dnnl.hpp"
namespace {
template <typename T>
struct DNNLType {
static constexpr dnnl::memory::data_type type =
dnnl::memory::data_type::undef;
};
template <>
struct DNNLType<int8_t> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
};
template <>
struct DNNLType<int32_t> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
};
template <>
struct DNNLType<float> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
};
template <>
struct DNNLType<c10::BFloat16> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
};
template <typename T>
constexpr inline dnnl::memory::data_type get_dnnl_type() {
return DNNLType<std::decay_t<T>>::type;
}
}; // namespace
template <bool InputNoScale>
class DNNLPrimitiveHelper {
public:
// I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
// A: [M, K], row-major
// B: [K, N], column-major
// C: [M, N], row-major
// bias: [N], row-major, optional
// a_scales: [MS]
// b_scales: [NS]
// Note: Due to the limitation of oneDNN
// (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
// not supported.
template <typename OutputT, typename BiasT>
static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
dnnl_dim_t K, const float* a_scales,
const float* b_scales, dnnl_dim_t MS,
dnnl_dim_t NS) {
auto&& OutputType = get_dnnl_type<OutputT>();
auto&& BiasType = get_dnnl_type<BiasT>();
dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
dnnl::primitive_attr attr;
if constexpr (!InputNoScale) {
if (MS == 1) {
// per-tensor
attr.set_scales_mask(DNNL_ARG_SRC, 0);
} else {
// per-token
TORCH_CHECK(false, "per-token quantization is unsupported.");
}
}
if (NS == 1) {
// per-tensor
attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
} else {
// per-channel
attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
}
dnnl::matmul::primitive_desc matmul_pd;
if (bias) {
dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
bias_md, c_md, attr);
} else {
matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
c_md, attr);
}
dnnl::matmul matmul(matmul_pd);
auto& engine = default_engine();
dnnl::memory a_m(a_md, engine, (void*)a);
dnnl::memory b_m(b_md, engine, (void*)b);
dnnl::memory c_m(c_md, engine, (void*)c);
dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
(void*)a_scales);
dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
(void*)b_scales);
auto& stream = default_stream();
if constexpr (InputNoScale) {
if (bias) {
dnnl::memory::desc bias_md({N}, BiasType, {1});
dnnl::memory bias_m(bias_md, engine, (void*)bias);
matmul.execute(
stream, {
{DNNL_ARG_SRC, a_m},
{DNNL_ARG_WEIGHTS, b_m},
{DNNL_ARG_BIAS, bias_m},
{DNNL_ARG_DST, c_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
});
} else {
matmul.execute(
stream, {
{DNNL_ARG_SRC, a_m},
{DNNL_ARG_WEIGHTS, b_m},
{DNNL_ARG_DST, c_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
});
}
} else {
if (bias) {
dnnl::memory::desc bias_md({N}, BiasType, {1});
dnnl::memory bias_m(bias_md, engine, (void*)bias);
matmul.execute(
stream, {
{DNNL_ARG_SRC, a_m},
{DNNL_ARG_WEIGHTS, b_m},
{DNNL_ARG_BIAS, bias_m},
{DNNL_ARG_DST, c_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
});
} else {
matmul.execute(
stream, {
{DNNL_ARG_SRC, a_m},
{DNNL_ARG_WEIGHTS, b_m},
{DNNL_ARG_DST, c_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
});
}
}
stream.wait();
}
private:
static dnnl::engine& default_engine() {
static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
return engine;
}
static dnnl::stream& default_stream() {
static dnnl::stream stream(default_engine());
return stream;
}
};
#endif
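
The comment block above documents the calling convention of `gemm_s8s8_jit` (row-major A, column-major B, per-tensor or per-channel scales). As a rough, standalone illustration of that convention — not part of this change; the sizes, data, `main` wrapper, and include path are assumptions, and it only builds where oneDNN and the header above are available — a per-tensor-scaled call might look like this:

#include <cstdint>
#include <vector>
#include "dnnl_helper.hpp"  // the header shown above

int main() {
  const dnnl_dim_t M = 4, N = 32, K = 64;
  std::vector<int8_t> a(M * K, 1);   // activations, row-major [M, K]
  std::vector<int8_t> b(K * N, 1);   // weights, column-major [K, N]
  std::vector<float> c(M * N, 0.f);  // output, row-major [M, N]
  float a_scale = 0.05f;             // per-tensor activation scale (MS == 1)
  float b_scale = 0.02f;             // per-tensor weight scale (NS == 1)
  // InputNoScale = false: oneDNN applies both the activation and weight scales.
  DNNLPrimitiveHelper<false>::gemm_s8s8_jit<float, float>(
      a.data(), b.data(), c.data(), /*bias=*/nullptr,
      M, N, K, &a_scale, &b_scale, /*MS=*/1, /*NS=*/1);
  // Each c[i][j] now holds a_scale * b_scale * sum_k a[i][k] * b[k][j].
  return 0;
}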

csrc/cpu/quant.cpp (new file, +294 lines)
View File

@@ -0,0 +1,294 @@
#include "cpu_types.hpp"
#include "dnnl_helper.hpp"
namespace {
template <typename scalar_t>
struct KernelVecType {
using load_vec_type = void;
using cvt_vec_type = void;
};
template <>
struct KernelVecType<float> {
using load_vec_type = vec_op::FP32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::BFloat16> {
using load_vec_type = vec_op::BF16Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
#ifdef __AVX512F__
template <typename scalar_t>
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
const float* scale, const int num_tokens,
const int hidden_size) {
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
constexpr float i8_min =
static_cast<float>(std::numeric_limits<int8_t>::min());
constexpr float i8_max =
static_cast<float>(std::numeric_limits<int8_t>::max());
const cvt_vec_t inv_scale(1.0 / *scale);
const cvt_vec_t i8_min_vec(i8_min);
const cvt_vec_t i8_max_vec(i8_max);
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
vec_op::INT8Vec16 elems_int8(elems_fp32);
elems_int8.save(output + i * hidden_size + j);
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
vec_op::INT8Vec16 elems_int8(elems_fp32);
if (j + vec_elem_num == hidden_size) {
elems_int8.save(output + i * hidden_size + j);
} else {
elems_int8.save(output + i * hidden_size + j, hidden_size - j);
}
}
}
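// Editorial note (illustrative, not part of this file): in scalar terms the
// AVX512 kernel above computes, for every token i and element j,
//   q[i][j] = clamp(x[i][j] / *scale, -128.0f, 127.0f) converted to int8,
// with the tail (hidden_size not a multiple of VEC_ELEM_NUM) written through a
// masked save; the exact rounding of the float-to-int8 conversion follows the
// vec_op::INT8Vec16 constructor.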
template <typename scalar_t>
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
float* scale, const int num_tokens,
const int hidden_size) {
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
cvt_vec_t max_abs(0.0);
{
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
max_abs = max_abs.max(elems_fp32.abs());
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
if (j + vec_elem_num == hidden_size) {
max_abs = max_abs.max(elems_fp32.abs());
} else {
max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
}
}
float scale_val = max_abs.reduce_max() / 127.0f;
scale[i] = scale_val;
const cvt_vec_t inv_scale(1.0 / scale_val);
{
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale);
vec_op::INT8Vec16 elems_int8(elems_fp32);
elems_int8.save(output + i * hidden_size + j);
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale);
vec_op::INT8Vec16 elems_int8(elems_fp32);
if (j + vec_elem_num == hidden_size) {
elems_int8.save(output + i * hidden_size + j);
} else {
elems_int8.save(output + i * hidden_size + j, hidden_size - j);
}
}
}
}
template <bool Bias, typename scalar_t>
void dynamic_output_scale_impl(const float* input, scalar_t* output,
const float* scale, const scalar_t* bias,
const int num_tokens, const int hidden_size) {
CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
int j = 0;
cvt_vec_t token_scale_vec(scale[i]);
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
cvt_vec_t elems_fp32(input + i * hidden_size + j);
elems_fp32 = elems_fp32 * token_scale_vec;
if constexpr (Bias) {
load_vec_t bias_vec(bias + j);
cvt_vec_t bias_vec_fp32(bias_vec);
elems_fp32 = elems_fp32 + bias_vec_fp32;
}
load_vec_t elems_out(elems_fp32);
elems_out.save(output + i * hidden_size + j);
}
cvt_vec_t elems_fp32(input + i * hidden_size + j);
elems_fp32 = elems_fp32 * token_scale_vec;
if constexpr (Bias) {
load_vec_t bias_vec(bias + j);
cvt_vec_t bias_vec_fp32(bias_vec);
elems_fp32 = elems_fp32 + bias_vec_fp32;
}
load_vec_t elems_out(elems_fp32);
if (j + vec_elem_num == hidden_size) {
elems_out.save(output + i * hidden_size + j);
} else {
elems_out.save(output + i * hidden_size + j, hidden_size - j);
}
}
}
#else
template <typename scalar_t>
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
const float* scale, const int num_tokens,
const int hidden_size) {
TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
}
template <typename scalar_t>
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
float* scale, const int num_tokens,
const int hidden_size) {
TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
}
template <typename scalar_t>
void dynamic_output_scale_impl() {
TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
}
#endif
} // namespace
void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& a, // [M, IC], row-major
const torch::Tensor& b, // [IC, OC], column-major
const torch::Tensor& a_scales, // [1] or [M]
const torch::Tensor& b_scales, // [1] or [OC]
const c10::optional<torch::Tensor>& bias // [OC]
) {
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
// Checks for conformality
TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
"int8_scaled_mm only supports INT8 inputs.")
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
b.size(1) == c.size(1));
TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
// Check for strides and alignment
TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major
TORCH_CHECK(b.stride(0) == 1); // Column-major
TORCH_CHECK(c.stride(0) % 16 == 0 &&
b.stride(1) % 16 == 0); // 16 Byte Alignment
TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
if (bias) {
TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
bias->dim() == 1);
}
VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
if (a_scales.numel() != 1) {
// per-token
// Note: oneDNN doesn't support per-token activation quantization
torch::Tensor tmp_fp32_out =
torch::empty_like(c, ::at::ScalarType::Float);
DNNLPrimitiveHelper<true>::gemm_s8s8_jit(
a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
tmp_fp32_out.data_ptr<float>(), (void*)(0), a.size(0), b.size(1),
a.size(1), (float*)(0), b_scales.data_ptr<float>(), 0,
b_scales.numel());
if (bias.has_value()) {
dynamic_output_scale_impl<true>(
tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
a_scales.data_ptr<float>(), bias->data_ptr<scalar_t>(), c.size(0),
c.size(1));
} else {
dynamic_output_scale_impl<false>(
tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
a_scales.data_ptr<float>(), (scalar_t*)(0), c.size(0), c.size(1));
}
} else {
// per-tensor
if (bias.has_value()) {
DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
bias->data_ptr<scalar_t>(), a.size(0), b.size(1), a.size(1),
a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
a_scales.numel(), b_scales.numel());
} else {
DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
(void*)(0), a.size(0), b.size(1), a.size(1),
a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
a_scales.numel(), b_scales.numel());
}
}
});
}
// static-per-tensor quantization.
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
const torch::Tensor& scale) {
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(scale.numel() == 1);
const int hidden_size = input.size(-1);
const int num_tokens = input.numel() / hidden_size;
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
static_scaled_int8_quant_impl(
input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
scale.data_ptr<float>(), num_tokens, hidden_size);
});
}
// dynamic-per-token quantization.
void dynamic_scaled_int8_quant(
torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
torch::Tensor& scale // [..., 1]
) {
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size;
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
dynamic_scaled_int8_quant_impl(
input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
scale.data_ptr<float>(), num_tokens, hidden_size);
});
}
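
Taken together, `dynamic_scaled_int8_quant` and `int8_scaled_mm` form the per-token W8A8 path that the CPU backend registers in the bindings below. A hedged sketch of composing them from C++ follows; the column-major, 16-element-aligned weight layout comes from the checks above, while the function name, shapes, and build setup are assumptions for illustration only:

#include <torch/torch.h>

// Declarations matching the definitions above (normally provided by the
// extension's bindings header).
void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                    const torch::Tensor& b, const torch::Tensor& a_scales,
                    const torch::Tensor& b_scales,
                    const c10::optional<torch::Tensor>& bias);
void dynamic_scaled_int8_quant(torch::Tensor& out, const torch::Tensor& input,
                               torch::Tensor& scale);

// x: [M, IC] float/bf16 activations; b_q: [IC, OC] int8 weights, column-major;
// b_scales: [OC] fp32. IC and OC are assumed to be multiples of 16 so that the
// stride/alignment checks in int8_scaled_mm pass.
torch::Tensor w8a8_linear_sketch(const torch::Tensor& x,
                                 const torch::Tensor& b_q,
                                 const torch::Tensor& b_scales) {
  const int64_t M = x.size(0), IC = x.size(1), OC = b_q.size(1);
  auto a_q = torch::empty({M, IC}, x.options().dtype(torch::kInt8));
  auto a_scales = torch::empty({M, 1}, x.options().dtype(torch::kFloat32));
  dynamic_scaled_int8_quant(a_q, x.contiguous(), a_scales);  // per-token scales
  auto y = torch::empty({M, OC}, x.options());
  // Per-token path: fp32 GEMM with weight scales applied inside oneDNN, then
  // the per-token activation scales (and bias, if any) applied afterwards.
  int8_scaled_mm(y, a_q, b_q, a_scales, b_scales, c10::nullopt);
  return y;  // roughly x @ dequantized(b_q)
}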

View File

@@ -4,7 +4,12 @@
 #include <torch/library.h>

-void init_cpu_threads_env(const std::string& cpu_ids);
+std::string init_cpu_threads_env(const std::string& cpu_ids);
+
+void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
+                    const torch::Tensor& b, const torch::Tensor& a_scales,
+                    const torch::Tensor& b_scales,
+                    const c10::optional<torch::Tensor>& bias);

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops
@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
-      " Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      " Tensor tmp_out, Tensor query, Tensor key_cache,"
+      " Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
+      " Tensor! tmp_out, Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"
@@ -84,6 +89,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      " Tensor! key, int head_size,"
      " Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
+
+  // Quantization
+#ifdef __AVX512F__
+  // Compute int8 quantized tensor for given scaling factor.
+  ops.def(
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
+      "()");
+  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
+
+  // Compute int8 quantized tensor and scaling factor
+  ops.def(
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
+      "()");
+  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
+           &dynamic_scaled_int8_quant);
+
+  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
+  // quantization.
+  ops.def(
+      "cutlass_scaled_mm(Tensor! out, Tensor a,"
+      " Tensor b, Tensor a_scales,"
+      " Tensor b_scales, Tensor? bias) -> ()");
+  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
+#endif
}

TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
@@ -95,8 +122,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Copy the cache blocks from src to dst.
  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
+      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);

  // Reshape the key and value tensors and cache them.
@@ -111,7 +138,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
  // CPU utils
-  utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env);
+  utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

View File

@@ -5,7 +5,7 @@
 #include "cpu_types.hpp"

-void init_cpu_threads_env(const std::string& cpu_ids) {
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
  bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
  TORCH_CHECK(omp_cpu_mask->size > 0);
  std::vector<int> omp_cpu_ids;
@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) {
  torch::set_num_threads((int)omp_cpu_ids.size());
  TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
  TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
+
+  std::vector<std::pair<int, int>> thread_core_mapping;
+  thread_core_mapping.reserve(omp_cpu_ids.size());
+  omp_lock_t writelock;
+  omp_init_lock(&writelock);
+
#pragma omp parallel for schedule(static, 1)
  for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
-    cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size);
-    size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size);
-    CPU_ZERO_S(size, mask);
-    CPU_SET_S(omp_cpu_ids[i], size, mask);
-    sched_setaffinity(0, sizeof(cpu_set_t), mask);
-    CPU_FREE(mask);
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(omp_cpu_ids[i], &mask);
+    int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
+    if (ret == -1) {
+      TORCH_CHECK(false,
+                  "sched_setaffinity failed. errno: " + std::to_string(errno));
+    }
+
+    omp_set_lock(&writelock);
+    thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
+    omp_unset_lock(&writelock);
  }
+  omp_destroy_lock(&writelock);

  numa_free_nodemask(omp_cpu_mask);
+
+  std::stringstream ss;
+  ss << "OMP threads binding of Process " << getpid() << ":\n";
+  std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
+            [](auto&& a, auto&& b) { return a.second < b.second; });
+  for (auto&& item : thread_core_mapping) {
+    ss << "\t"
+       << "OMP tid: " << item.first << ", core " << item.second << "\n";
+  }
+  return ss.str();
}

View File

@@ -1737,4 +1737,4 @@ torch::Tensor marlin_gemm_moe(
      moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
      thread_n, sms, max_par, replicate_input, apply_weights);
  return c;
}

View File

@@ -9,4 +9,4 @@ torch::Tensor marlin_gemm_moe(
    const torch::Tensor& g_idx, const torch::Tensor& perm,
    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
    bool replicate_input, bool apply_weights);

View File

@@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
      "bool replicate_input, bool apply_weights) -> Tensor");
  m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
#endif
}

View File

@@ -123,9 +123,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);

+torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                      torch::Tensor& perm, c10::SymInt size_k,
+                                      c10::SymInt size_n, int64_t num_bits);
+
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
                                int64_t size_n, int64_t num_bits);

+torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                     c10::SymInt size_k, c10::SymInt size_n,
+                                     int64_t num_bits);
+
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
                              int64_t n);
@@ -170,9 +178,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                               torch::Tensor& scales);

-void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
-                     torch::Tensor lookup_table);
-
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,

View File

@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
}
#endif

+torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                     c10::SymInt size_k, c10::SymInt size_n,
+                                     int64_t num_bits) {
+  int const pack_factor = 32 / num_bits;
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  return torch::empty_symint(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
+}

View File

@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
}
#endif

+torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                      torch::Tensor& perm, c10::SymInt size_k,
+                                      c10::SymInt size_n, int64_t num_bits) {
+  int const pack_factor = 32 / num_bits;
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  return torch::empty_symint(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
+}

View File

@@ -1,216 +0,0 @@
#include <torch/all.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
// half-tensor
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDATensorMethods.cuh>
#include <c10/cuda/CUDAGuard.h>
#define BLOCKWIDTH 128
#define BLOCKHEIGHT4 16
namespace vllm {
namespace squeezellm {
__device__ inline unsigned int as_unsigned(int i) {
return *reinterpret_cast<unsigned int*>(&i);
}
// 4-bit matvec kernel (LUT-based)
__global__ void NUQ4MatMulKernel(
#ifndef USE_ROCM
const half2* __restrict__ vec,
#else
const __half2* __restrict__ vec,
#endif
const int* __restrict__ mat,
#ifndef USE_ROCM
half2* __restrict__ mul,
#else
float2* __restrict__ mul,
#endif
const __half* __restrict__ lookup_table, int height, int width, int batch,
int vec_height) {
const int blockwidth2 = BLOCKWIDTH / 2;
int row = BLOCKHEIGHT4 * blockIdx.x;
int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;
#ifndef USE_ROCM
__shared__ half2 blockvec[blockwidth2];
#else
__shared__ __half2 blockvec[blockwidth2];
#endif
__shared__ __half deq2[16][BLOCKWIDTH];
int off = threadIdx.x;
int column_offset = col * 16;
for (int val = 0; val < 16; val += 1) {
int lut_index = column_offset + val;
deq2[val][off] = lookup_table[lut_index];
}
__half res;
#ifndef USE_ROCM
half2 res2;
half2 tmp2;
#else
__half2 res2;
__half2 tmp2;
#endif
int i;
int k;
unsigned int tmp1;
unsigned int lut_index1, lut_index2;
for (int b = 0; b < batch; ++b) {
i = width * row + col;
res = __int2half_rd(0);
k = 0;
__syncthreads();
if (threadIdx.x < blockwidth2)
blockvec[threadIdx.x] =
vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 +
threadIdx.x];
__syncthreads();
while (k < blockwidth2) {
tmp1 = as_unsigned(mat[i]);
#ifndef USE_ROCM
res2 = {};
tmp2 = {};
#else
res2.x = __half_as_ushort(__float2half(0));
res2.y = __half_as_ushort(__float2half(0));
tmp2.x = __half_as_ushort(__float2half(0));
tmp2.y = __half_as_ushort(__float2half(0));
#endif
lut_index1 = tmp1 & 0xF;
lut_index2 = (tmp1 >> 4) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 0], res2);
lut_index1 = (tmp1 >> 8) & 0xF;
lut_index2 = (tmp1 >> 12) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 1], res2);
lut_index1 = (tmp1 >> 16) & 0xF;
lut_index2 = (tmp1 >> 20) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 2], res2);
lut_index1 = (tmp1 >> 24) & 0xF;
lut_index2 = (tmp1 >> 28) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 3], res2);
#ifndef USE_ROCM
res = __hadd(__hadd(res2.x, res2.y), res);
#else
res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)),
res);
#endif
i += width;
k += 4;
}
// col%2 -> only set one of the two values
#ifndef USE_ROCM
half2 res3 = {};
if (col % 2 == 0) {
res3.x = res;
} else {
res3.y = res;
}
#else
__half2 res3;
res3.x = __half_as_ushort(__float2half(0));
res3.y = __half_as_ushort(__float2half(0));
if (col % 2 == 0) {
res3.x = __half_as_ushort(res);
} else {
res3.y = __half_as_ushort(res);
}
#endif
#ifndef USE_ROCM
atomicAdd(&mul[b * width / 2 + col / 2], res3);
#else
int tmp_addr = b * width / 2 + col / 2;
atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x)));
atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y)));
#endif
}
}
} // namespace squeezellm
} // namespace vllm
// 4-bit matvec kernel (LUT-based)
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor lookup_table) {
int height = mat.size(0);
int width = mat.size(1);
int batch = vec.size(0);
int vec_height = vec.size(1);
dim3 blocks((height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
(width + BLOCKWIDTH - 1) / BLOCKWIDTH);
dim3 threads(BLOCKWIDTH);
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
#ifndef USE_ROCM
(half2*)vec.data_ptr<at::Half>(),
#else
(__half2*)vec.data_ptr<at::Half>(),
#endif
mat.data_ptr<int>(),
#ifndef USE_ROCM
(half2*)mul.data_ptr<at::Half>(),
(__half*)lookup_table.data_ptr<at::Half>(),
#else
(float2*)mul.data_ptr<float>(),
(__half*)lookup_table.data_ptr<at::Half>(),
#endif
height, width, batch, vec_height);
}
#undef BLOCKWIDTH
#undef BLOCKHEIGHT4

View File

@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
-      " Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      " Tensor tmp_out, Tensor query, Tensor key_cache,"
+      " Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
+      " Tensor! tmp_out, Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"
@@ -73,7 +73,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);

  // prepare_inputs advance_step
-  ops.def("advance_step", &advance_step);
+  ops.def(
+      "advance_step(int num_seqs, int num_queries, int block_size, "
+      "Tensor! input_tokens, Tensor sampled_token_ids, "
+      "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
+      "Tensor block_tables) -> ()");
  ops.impl("advance_step", torch::kCUDA, &advance_step);

  // Layernorm
@@ -110,27 +114,56 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Quantization ops
#ifndef USE_ROCM
  // Quantized GEMM for AQLM.
-  ops.def("aqlm_gemm", &aqlm_gemm);
+  ops.def(
+      "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
+      "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
+      "-> Tensor");
  ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);

  // Decompression method for AQLM.
-  ops.def("aqlm_dequant", &aqlm_dequant);
+  ops.def(
+      "aqlm_dequant(Tensor codes, Tensor codebooks, "
+      "int[] codebook_partition_sizes) -> Tensor");
  ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);

  // Quantized GEMM for AWQ.
-  ops.def("awq_gemm", &awq_gemm);
+  ops.def(
+      "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
+      "Tensor _zeros, int split_k_iters) -> Tensor");
  ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);

  // Dequantization for AWQ.
-  ops.def("awq_dequantize", &awq_dequantize);
+  ops.def(
+      "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
+      "Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
  ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);

+  // Note about marlin kernel 'workspace' arguments:
+  // Technically these should be mutable since they are modified by the kernel.
+  // But since they are set back to zero once the kernel is finished we can
+  // hand wave and say that they have no net effect.
+  //
+  // The reason to mark 'workspace' as immutable is so that they don't interfere
+  // with using ScalarType arguments in the ops. If they are marked as mutable,
+  // pytorch throws an assert in
+  // 'torch._higher_order_ops._register_effectful_op' that prevents these
+  // kernels from being torch.compile'd.
+  // See the following document for more info on custom types and ops that use
+  // custom types:
+  // https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
+
  // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
-  ops.def("marlin_gemm", &marlin_gemm);
+  ops.def(
+      "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
  ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);

  // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
+  ops.def(
+      "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
+      "Tensor b_scales, Tensor workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
+      "int size_m, int size_n, int size_k) -> Tensor");
  ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);

  // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
@@ -149,35 +182,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);

  // gptq_marlin Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm);
+  ops.def(
+      "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
+      "int size_m, int size_n, int size_k, bool is_k_full, "
+      "bool has_zp, bool use_fp32_reduce) -> Tensor");
  ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);

  // gptq_marlin repack from GPTQ.
-  ops.def("gptq_marlin_repack", &gptq_marlin_repack);
+  ops.def(
+      "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
+      "SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
  ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
+  ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);

  // awq_marlin repack from AWQ.
-  ops.def("awq_marlin_repack", &awq_marlin_repack);
+  ops.def(
+      "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
+      "SymInt size_n, int num_bits) -> Tensor");
  ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
+  ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);

  // Dequantization for GGML.
-  ops.def("ggml_dequantize", &ggml_dequantize);
+  ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
  ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);

  // mmvq kernel for GGML.
-  ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8);
+  ops.def(
+      "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
+      "-> Tensor");
  ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);

  // mmq kernel for GGML.
-  ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8);
+  ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
  ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);

  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
-  ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
+  ops.def(
+      "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor! workspace, int num_bits, int size_m, int size_n, "
+      "int size_k) -> Tensor");
  ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);

  // marlin_qqq_gemm for QQQ.
-  ops.def("marlin_qqq_gemm", &marlin_qqq_gemm);
+  ops.def(
+      "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
+      "Tensor s_tok, Tensor s_ch, Tensor s_group, "
+      "Tensor! workspace, int size_m, int size_n, "
+      "int size_k) -> Tensor");
  ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);

  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
@@ -199,16 +252,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Check if cutlass scaled_mm is supported for CUDA devices of the given
  // capability
-  ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
-  ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
-           &cutlass_scaled_mm_supports_fp8);
+  ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
+  ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);

  // Mamba selective scan kernel
  ops.def(
      "selective_scan_fwd(Tensor! u, Tensor! delta,"
      "Tensor! A, Tensor! B, Tensor! C,"
      "Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
      "bool delta_softplus,"
-      "Tensor? index_, Tensor? x) -> Tensor[]");
+      "Tensor? index_, Tensor(a! -> *)? x) -> Tensor(a)[]");
  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);

  ops.def(
@@ -230,19 +283,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#endif

  // Quantized GEMM for GPTQ.
-  ops.def("gptq_gemm", &gptq_gemm);
+  // Note: even though the C++ inferred schema is correct for this op, it seems
+  // to prevent the meta function registry.
+  ops.def(
+      "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
+      "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
+      "-> Tensor");
  ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);

  // Post processing for GPTQ.
  ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
  ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);

-  // Quantized GEMM for SqueezeLLM.
-  ops.def(
-      "squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
-      "lookup_table) -> ()");
-  ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
-
  // Compute FP8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
@@ -256,8 +308,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
  ops.def(
-      "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! "
-      "scale, Tensor? scale_ub) -> "
+      "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
+      "Tensor! scale, Tensor? scale_ub) -> "
      "()");
  ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
           &dynamic_per_token_scaled_fp8_quant);
@@ -294,8 +346,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Copy the cache blocks from src to dst.
  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
+      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);

  // Reshape the key and value tensors and cache them.
@@ -320,8 +372,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Convert the key and value cache to fp8 data type.
  cache_ops.def(
-      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str "
-      "kv_cache_dtype) -> ()");
+      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
+      "str kv_cache_dtype) -> ()");
  cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
}
@@ -329,24 +381,28 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
  // Cuda utils

  // Gets the specified device attribute.
-  cuda_utils.def("get_device_attribute", &get_device_attribute);
-  cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute);
+  cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
+  cuda_utils.impl("get_device_attribute", &get_device_attribute);

  // Gets the maximum shared memory per block device attribute.
-  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
-                 &get_max_shared_memory_per_block_device_attribute);
+  cuda_utils.def(
+      "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
  cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
-                  torch::kCUDA,
                  &get_max_shared_memory_per_block_device_attribute);
}

#ifndef USE_ROCM
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  // Custom all-reduce kernels
-  custom_ar.def("init_custom_ar", &init_custom_ar);
+  custom_ar.def(
+      "init_custom_ar(Tensor meta, Tensor rank_data, "
+      "str[] handles, int[] offsets, int rank, "
+      "bool full_nvlink) -> int");
  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);

-  custom_ar.def("should_custom_ar", &should_custom_ar);
+  custom_ar.def(
+      "should_custom_ar(Tensor inp, int max_size, int world_size, "
+      "bool full_nvlink) -> bool");
  custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);

  custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
@@ -358,21 +414,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);

  custom_ar.def("dispose", &dispose);
-  custom_ar.impl("dispose", torch::kCPU, &dispose);
  custom_ar.def("meta_size", &meta_size);
-  custom_ar.impl("meta_size", torch::kCPU, &meta_size);

-  custom_ar.def("register_buffer", &register_buffer);
+  custom_ar.def(
+      "register_buffer(int fa, Tensor t, str[] handles, "
+      "int[] offsets) -> ()");
  custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);

  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
-  custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
-                 &get_graph_buffer_ipc_meta);

  custom_ar.def("register_graph_buffers", &register_graph_buffers);
-  custom_ar.impl("register_graph_buffers", torch::kCPU,
-                 &register_graph_buffers);
}
#endif

View File

@@ -11,6 +11,5 @@ pydantic >= 2.8
torch
py-cpuinfo
transformers
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
mistral_common >= 1.3.4
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

View File

@@ -5,6 +5,7 @@ vLLM Meetups
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__

View File

@@ -99,6 +99,7 @@ autodoc_mock_imports = [
    "aiohttp",
    "compressed_tensors",
    "cpuinfo",
+    "cv2",
    "torch",
    "transformers",
    "psutil",

View File

@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.
.. tip::

    Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.

-Example commands:
+.. tip::
+
+    Stopping the profiler flushes all of the profile trace files to the directory, which takes time: for example, about 100 requests' worth of data for a Llama 70B takes roughly 10 minutes to flush out on an H100.
+    Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a large value (say, 30 minutes) before you start the server:
+
+    ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+
+Example commands and usage:
+===========================
+
+Offline Inference:
+------------------
+
+Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
+
OpenAI Server:
+--------------

.. code-block:: bash

-    VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
+    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B

benchmark_serving.py:

View File

@@ -21,7 +21,7 @@ If you have already taken care of the above issues, but the vLLM instance still
With more logging, hopefully you can find the root cause of the issue.

-If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
+If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.

Here are some common issues that can cause hangs:

View File

@@ -24,7 +24,9 @@ Offline Batched Inference
We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.

-Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process.
+Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM.
+The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine.
+The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process.

.. code-block:: python
@@ -42,7 +44,7 @@ Define the list of input prompts and the sampling parameters for generation. The
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.
+Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.

.. code-block:: python

View File

@@ -107,3 +107,55 @@ The following is an example request
    "max_tokens": 7,
    "temperature": 0
}' | jq
Dynamically serving LoRA Adapters
---------------------------------
In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading
LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
to change models on-the-fly is needed.
Note: enabling this feature in production environments is risky, since it allows users to take part in model adapter management.
To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
.. code-block:: bash
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
Loading a LoRA Adapter:
To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter.
Example request to load a LoRA adapter:
.. code-block:: bash
curl -X POST http://localhost:8000/v1/load_lora_adapter \
-H "Content-Type: application/json" \
-d '{
"lora_name": "sql_adapter",
"lora_path": "/path/to/sql-lora-adapter"
}'
Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter
cannot be found or loaded, an appropriate error message will be returned.
Unloading a LoRA Adapter:
To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint
with the name or ID of the adapter to be unloaded.
Example request to unload a LoRA adapter:
.. code-block:: bash
curl -X POST http://localhost:8000/v1/unload_lora_adapter \
-H "Content-Type: application/json" \
-d '{
"lora_name": "sql_adapter"
}'

View File

@@ -161,6 +161,46 @@ A variety of speculative models of this type are available on HF hub:
* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
Lossless guarantees of Speculative Decoding
-------------------------------------------
In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of
speculative decoding, breaking down the guarantees into three key areas:
1. **Theoretical Losslessness**
- Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might
cause slight variations in output distributions, as discussed
in `Accelerating Large Language Model Decoding with Speculative Sampling <https://arxiv.org/pdf/2302.01318>`_
2. **Algorithmic Losslessness**
- vLLM's implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include:
- **Rejection Sampler Convergence**: Ensures that samples from vLLM's rejection sampler align with the target
distribution. `View Test Code <https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252>`_
- **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
provides a lossless guarantee. Almost all of the tests in `this directory <https://github.com/vllm-project/vllm/tree/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e>`_
verify this property using `this assertion implementation <https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291>`_
3. **vLLM Logprob Stability**
- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
same request across runs. For more details, see the FAQ section
titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_.
**Conclusion**
While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding
can occur due to the following factors:
- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution.
- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially
due to non-deterministic behavior in batched operations or numerical instability.
**Mitigation Strategies**
For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_.
Resources for vLLM contributors
-------------------------------

View File

@@ -194,12 +194,12 @@ Multimodal Language Models
  * - Architecture
    - Models
-    - Supported Modalities
+    - Modalities
    - Example HuggingFace Models
    - :ref:`LoRA <lora>`
  * - :code:`Blip2ForConditionalGeneration`
    - BLIP-2
-    - Image
+    - Image\ :sup:`E`
    - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
    -
  * - :code:`ChameleonForConditionalGeneration`
@@ -214,44 +214,75 @@ Multimodal Language Models
    -
  * - :code:`InternVLChatModel`
    - InternVL2
-    - Image
+    - Image\ :sup:`E+`
    - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
    -
  * - :code:`LlavaForConditionalGeneration`
    - LLaVA-1.5
-    - Image
+    - Image\ :sup:`E+`
    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
    -
  * - :code:`LlavaNextForConditionalGeneration`
    - LLaVA-NeXT
-    - Image
+    - Image\ :sup:`E+`
    - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
    -
+  * - :code:`LlavaNextVideoForConditionalGeneration`
+    - LLaVA-NeXT-Video
+    - Video
+    - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
+    -
+  * - :code:`MiniCPMV`
+    - MiniCPM-V
+    - Image\ :sup:`+`
+    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
+    -
  * - :code:`PaliGemmaForConditionalGeneration`
    - PaliGemma
-    - Image
+    - Image\ :sup:`E`
    - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
    -
  * - :code:`Phi3VForCausalLM`
    - Phi-3-Vision, Phi-3.5-Vision
-    - Image
+    - Image\ :sup:`E+`
    - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
    -
-  * - :code:`MiniCPMV`
-    - MiniCPM-V
-    - Image
-    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
-    -
+  * - :code:`PixtralForConditionalGeneration`
+    - Pixtral
+    - Image\ :sup:`+`
+    - :code:`mistralai/Pixtral-12B-2409`
+    -
+  * - :code:`QWenLMHeadModel`
+    - Qwen-VL
+    - Image\ :sup:`E`
+    - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
+    -
+  * - :code:`Qwen2VLForConditionalGeneration`
+    - Qwen2-VL (see note)
+    - Image\ :sup:`+` / Video\ :sup:`+`
+    - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
+    -
  * - :code:`UltravoxModel`
    - Ultravox
-    - Audio
+    - Audio\ :sup:`E+`
    - :code:`fixie-ai/ultravox-v0_3`
    -

+| :sup:`E` Pre-computed embeddings can be inputted for this modality.
+| :sup:`+` Multiple items can be inputted per text prompt for this modality.
+
.. note::
    For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
    For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

+.. note::
+    For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
+    This can be installed by running the following command:
+
+    .. code-block:: bash
+
+        pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
+
----

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.


@@ -9,26 +9,23 @@ This document shows you how to run and serve these models using vLLM.
.. important:: .. important::
We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation. We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
Currently, the support for vision language models on vLLM has the following limitations:
* Only single image input is supported per text prompt.
We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests. We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
Offline Batched Inference Offline Inference
------------------------- -----------------
To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine. Single-image input
^^^^^^^^^^^^^^^^^^
The :class:`~vllm.LLM` class can be instantiated in much the same way as for language-only models.
.. code-block:: python .. code-block:: python
llm = LLM(model="llava-hf/llava-1.5-7b-hf") llm = LLM(model="llava-hf/llava-1.5-7b-hf")
.. important:: .. note::
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model.
internally for each model.
To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`: To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:
@@ -86,61 +83,117 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI
A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_. A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.
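As a quick reference, a minimal single-image request through ``llm.generate`` might look like the sketch below (the image file name is a placeholder, and the prompt follows the LLaVA-1.5 format used above):
.. code-block:: python

    from PIL import Image

    from vllm import LLM

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # Load any RGB image with PIL; the file name here is illustrative.
    image = Image.open("example.jpg").convert("RGB")

    outputs = llm.generate({
        "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
        "multi_modal_data": {"image": image},
    })

    for o in outputs:
        print(o.outputs[0].text)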
Multi-image input
^^^^^^^^^^^^^^^^^
Online OpenAI Vision API Compatible Inference Multi-image input is only supported for a subset of VLMs, as shown :ref:`here <supported_vlms>`.
----------------------------------------------
To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class.
.. code-block:: python
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, # Required to load Phi-3.5-vision
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
)
Instead of passing in a single image, you can pass in a list of images.
.. code-block:: python
# Refer to the HuggingFace repo for the correct format to use
prompt = "<|user|>\n<image_1>\n<image_2>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
# Load the images using PIL.Image
image1 = PIL.Image.open(...)
image2 = PIL.Image.open(...)
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {
"image": [image1, image2]
},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_.
Online Inference
----------------
OpenAI Vision API
^^^^^^^^^^^^^^^^^
You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_. You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
.. note:: Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server.
Currently, vLLM supports only **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be
added in the future.
Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with vLLM API server.
.. important::
Since OpenAI Vision API is based on `Chat <https://platform.openai.com/docs/api-reference/chat>`_ API, a chat template
is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the
HuggingFace Llava chat template that you can find in the example folder `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
.. code-block:: bash .. code-block:: bash
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
--trust-remote-code --limit-mm-per-prompt image=2
.. important:: .. important::
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that a chat template is **required** to launch the API server.
internally for each model.
Although Phi-3.5-Vision comes with a chat template, other models may require you to provide one if their tokenizer does not include it.
The chat template can be inferred from the documentation on the model's HuggingFace repo.
For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
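For example, serving LLaVA-1.5 with that template might look like this (assuming the template file has been downloaded next to where the server is launched):
.. code-block:: bash

    vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja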
To consume the server, you can use the OpenAI client like in the example below: To consume the server, you can use the OpenAI client like in the example below:
.. code-block:: python .. code-block:: python
from openai import OpenAI from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( client = OpenAI(
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
# Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
chat_response = client.chat.completions.create( chat_response = client.chat.completions.create(
model="llava-hf/llava-1.5-7b-hf", model="microsoft/Phi-3.5-vision-instruct",
messages=[{ messages=[{
"role": "user", "role": "user",
"content": [ "content": [
# NOTE: The prompt formatting with the image token `<image>` is not needed # NOTE: The prompt formatting with the image token `<image>` is not needed
# since the prompt will be processed automatically by the API server. # since the prompt will be processed automatically by the API server.
{"type": "text", "text": "What's in this image?"}, {"type": "text", "text": "Whats in this image?"},
{ {"type": "image_url", "image_url": {"url": image_url}},
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
},
},
], ],
}], }],
) )
print("Chat response:", chat_response) print("Chat completion output:", chat_response.choices[0].message.content)
# Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct",
messages=[{
"role": "user",
"content": [
{"type": "text", "text": "What are the animals in these images?"},
{"type": "image_url", "image_url": {"url": image_url_duck}},
{"type": "image_url", "image_url": {"url": image_url_lion}},
],
}],
)
print("Chat completion output:", chat_response.choices[0].message.content)
A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_. A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.


@@ -119,17 +119,6 @@ The table below shows the compatibility of various quantization implementations
- ✗ - ✗
- ✗ - ✗
- ✗ - ✗
* - SqueezeLLM
- ✅︎
- ✅︎
- ✅︎
- ✅︎
- ✅︎
- ✗
- ✗
- ✗
- ✗
- ✗
Notes: Notes:
^^^^^^ ^^^^^^


@@ -10,3 +10,22 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
Q: Which model should I use for offline inference embedding? Q: Which model should I use for offline inference embedding?
A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. In contrast, models such as Llama-3-8B and Mistral-7B-Instruct-v0.3 are generation models rather than embedding models. A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. In contrast, models such as Llama-3-8B and Mistral-7B-Instruct-v0.3 are generation models rather than embedding models.
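As a rough sketch, the suggested embedding model can be run offline with ``LLM.encode`` (the prompt text below is illustrative):
.. code-block:: python

    from vllm import LLM

    llm = LLM(model="intfloat/e5-mistral-7b-instruct")

    # encode() returns one EmbeddingRequestOutput per prompt.
    outputs = llm.encode(["Hello, my name is"])
    print(len(outputs[0].outputs.embedding))  # embedding dimension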
----------------------------------------
Q: Can the output of a prompt vary across runs in vLLM?
A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to
numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details,
see the `Numerical Accuracy section <https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations>`_.
In vLLM, the same requests might be batched differently due to factors such as other concurrent requests,
changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations,
can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in
different tokens being sampled. Once a different token is sampled, further divergence is likely.
**Mitigation Strategies**
- For improved stability and reduced variance, use `float32`. Note that this will require more memory.
- If using `bfloat16`, switching to `float16` can also help.
- Using request seeds can aid in achieving more stable generation for temperature > 0, but discrepancies due to precision differences may still occur.
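A minimal sketch of applying these mitigations when constructing the engine and the per-request sampling parameters (the model name and seed are illustrative):
.. code-block:: python

    from vllm import LLM, SamplingParams

    # float32 improves numerical stability at the cost of extra memory.
    llm = LLM(model="facebook/opt-125m", dtype="float32")

    # A per-request seed makes sampling with temperature > 0 more repeatable,
    # though precision-related differences may still occur.
    sampling_params = SamplingParams(temperature=0.7, seed=42)
    outputs = llm.generate(["The capital of France is"], sampling_params)
    print(outputs[0].outputs[0].text)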


@@ -62,7 +62,7 @@ This script evaluates the inference throughput of language models using various
python3 benchmarks/benchmark_throughput.py --help python3 benchmarks/benchmark_throughput.py --help
usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL] usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
[--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N] [--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
[--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code] [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
[--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}] [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
[--quantization-param-path KV_CACHE_quantization_param_path] [--quantization-param-path KV_CACHE_quantization_param_path]
@@ -76,7 +76,7 @@ optional arguments:
--output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset. --output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset.
--model MODEL --model MODEL
--tokenizer TOKENIZER --tokenizer TOKENIZER
--quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None} --quantization {awq,gptq,None}, -q {awq,gptq,None}
--tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
--n N Number of generated sequences per prompt. --n N Number of generated sequences per prompt.
--use-beam-search --use-beam-search
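For instance, a synthetic-workload run that uses only the options listed above might look like the following (the model and lengths are placeholders):
.. code-block:: bash

    python3 benchmarks/benchmark_throughput.py \
        --backend vllm \
        --model facebook/opt-125m \
        --input-len 128 \
        --output-len 128 \
        --num-prompts 100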


@@ -1,6 +1,6 @@
### Quantizer Utilities ### Quantizer Utilities
`quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM: `quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported
`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py` from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py)
### Prerequisite ### Prerequisite


@@ -0,0 +1,164 @@
# ruff: noqa
import argparse
from vllm import LLM
from vllm.sampling_params import SamplingParams
# This script is an offline demo for running Pixtral.
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer_mode mistral --limit_mm_per_prompt 'image=4' --max_num_batched_tokens 16384
# ```
#
# - Client:
#
# ```bash
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
# "model": "mistralai/Pixtral-12B-2409",
# "messages": [
# {
# "role": "user",
# "content": [
# {"type" : "text", "text": "Describe this image in detail please."},
# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
# {"type" : "text", "text": "and this one as well. Answer in French."},
# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
# ]
# }
# ]
# }'
# ```
#
# Usage:
# python demo.py simple
# python demo.py advanced
def run_simple_demo():
model_name = "mistralai/Pixtral-12B-2409"
sampling_params = SamplingParams(max_tokens=8192)
llm = LLM(model=model_name, tokenizer_mode="mistral")
prompt = "Describe this image in one sentence."
image_url = "https://picsum.photos/id/237/200/300"
messages = [
{
"role":
"user",
"content": [
{
"type": "text",
"text": prompt
},
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
],
},
]
outputs = llm.chat(messages, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
def run_advanced_demo():
model_name = "mistralai/Pixtral-12B-2409"
max_img_per_msg = 5
max_tokens_per_img = 4096
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
limit_mm_per_prompt={"image": max_img_per_msg},
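# Note: the batched-token budget below is sized so that one message carrying
# max_img_per_msg images (each consuming up to max_tokens_per_img tokens)
# fits into a single batch.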
max_num_batched_tokens=max_img_per_msg * max_tokens_per_img,
)
prompt = "Describe the following image."
url_1 = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
url_2 = "https://picsum.photos/seed/picsum/200/300"
url_3 = "https://picsum.photos/id/32/512/512"
messages = [
{
"role":
"user",
"content": [
{
"type": "text",
"text": prompt
},
{
"type": "image_url",
"image_url": {
"url": url_1
}
},
{
"type": "image_url",
"image_url": {
"url": url_2
}
},
],
},
{
"role": "assistant",
"content": "The images show nature.",
},
{
"role": "user",
"content": "More details please and answer only in French!.",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": url_3
}
},
],
},
]
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
def main():
parser = argparse.ArgumentParser(
description="Run a demo in simple or advanced mode.")
parser.add_argument(
"mode",
choices=["simple", "advanced"],
help="Specify the demo mode: 'simple' or 'advanced'",
)
args = parser.parse_args()
if args.mode == "simple":
print("Running simple demo...")
run_simple_demo()
elif args.mode == "advanced":
print("Running advanced demo...")
run_advanced_demo()
if __name__ == "__main__":
main()


@@ -9,12 +9,9 @@ from transformers import AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
# Input image and question
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
question = "What is the content of this image?"
# LLaVA-1.5 # LLaVA-1.5
def run_llava(question): def run_llava(question):
@@ -30,7 +27,16 @@ def run_llava(question):
def run_llava_next(question): def run_llava_next(question):
prompt = f"[INST] <image>\n{question} [/INST]" prompt = f"[INST] <image>\n{question} [/INST]"
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf") llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
stop_token_ids = None
return llm, prompt, stop_token_ids
# LLaVA-NeXT-Video
# Currently only supports video input
def run_llava_next_video(question):
prompt = f"USER: <video>\n{question} ASSISTANT:"
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
@@ -159,9 +165,41 @@ def run_blip2(question):
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# Qwen
def run_qwen_vl(question):
llm = LLM(
model="Qwen/Qwen-VL",
trust_remote_code=True,
max_num_seqs=5,
)
prompt = f"{question}Picture 1: <img></img>\n"
stop_token_ids = None
return llm, prompt, stop_token_ids
# Qwen2-VL
def run_qwen2_vl(question):
model_name = "Qwen/Qwen2-VL-7B-Instruct"
llm = LLM(
model=model_name,
max_num_seqs=5,
)
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n")
stop_token_ids = None
return llm, prompt, stop_token_ids
model_example_map = { model_example_map = {
"llava": run_llava, "llava": run_llava,
"llava-next": run_llava_next, "llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
"fuyu": run_fuyu, "fuyu": run_fuyu,
"phi3_v": run_phi3v, "phi3_v": run_phi3v,
"paligemma": run_paligemma, "paligemma": run_paligemma,
@@ -169,14 +207,54 @@ model_example_map = {
"minicpmv": run_minicpmv, "minicpmv": run_minicpmv,
"blip-2": run_blip2, "blip-2": run_blip2,
"internvl_chat": run_internvl, "internvl_chat": run_internvl,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
} }
def get_multi_modal_input(args):
"""
return {
"data": image or video,
"question": question,
}
"""
if args.modality == "image":
# Input image and question
image = ImageAsset("cherry_blossom") \
.pil_image.convert("RGB")
img_question = "What is the content of this image?"
return {
"data": image,
"question": img_question,
}
if args.modality == "video":
# Input video and question
video = VideoAsset(name="sample_demo_1.mp4",
num_frames=args.num_frames).np_ndarrays
vid_question = "Why is this video funny?"
return {
"data": video,
"question": vid_question,
}
msg = f"Modality {args.modality} is not supported."
raise ValueError(msg)
def main(args): def main(args):
model = args.model_type model = args.model_type
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
modality = args.modality
mm_input = get_multi_modal_input(args)
data = mm_input["data"]
question = mm_input["question"]
llm, prompt, stop_token_ids = model_example_map[model](question) llm, prompt, stop_token_ids = model_example_map[model](question)
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
@@ -191,7 +269,7 @@ def main(args):
inputs = { inputs = {
"prompt": prompt, "prompt": prompt,
"multi_modal_data": { "multi_modal_data": {
"image": image modality: data
}, },
} }
@@ -200,7 +278,7 @@ def main(args):
inputs = [{ inputs = [{
"prompt": prompt, "prompt": prompt,
"multi_modal_data": { "multi_modal_data": {
"image": image modality: data
}, },
} for _ in range(args.num_prompts)] } for _ in range(args.num_prompts)]
@@ -223,8 +301,15 @@ if __name__ == "__main__":
help='Huggingface "model_type".') help='Huggingface "model_type".')
parser.add_argument('--num-prompts', parser.add_argument('--num-prompts',
type=int, type=int,
default=1, default=4,
help='Number of prompts to run.') help='Number of prompts to run.')
parser.add_argument('--modality',
type=str,
default="image",
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
default=16,
help='Number of frames to extract from the video.')
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)


@@ -0,0 +1,207 @@
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models, using the chat template defined
by the model.
"""
from argparse import Namespace
from typing import List
from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
QUESTION = "What is the content of each image?"
IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]
def load_phi3v(question, image_urls: List[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "\n".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
stop_token_ids = None
return llm, prompt, stop_token_ids, None
def load_internvl(question, image_urls: List[str]):
model_name = "OpenGVLab/InternVL2-2B"
llm = LLM(
model=model_name,
trust_remote_code=True,
max_num_seqs=5,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
for i, _ in enumerate(image_urls, start=1))
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# Stop tokens for InternVL
# model variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B#service
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompt, stop_token_ids, None
def load_qwen2_vl(question, image_urls: List[str]):
try:
from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
print('WARNING: `qwen-vl-utils` not installed, input images will not '
'be automatically resized. You can enable this functionality by '
'`pip install qwen-vl-utils`.')
process_vision_info = None
model_name = "Qwen/Qwen2-VL-7B-Instruct"
llm = LLM(
model=model_name,
max_num_seqs=5,
max_model_len=32768 if process_vision_info is None else 4096,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls]
else:
image_data, _ = process_vision_info(messages)
return llm, prompt, stop_token_ids, image_data
model_example_map = {
"phi3_v": load_phi3v,
"internvl_chat": load_internvl,
"qwen2_vl": load_qwen2_vl,
}
def run_generate(model, question: str, image_urls: List[str]):
llm, prompt, stop_token_ids, image_data = model_example_map[model](
question, image_urls)
if image_data is None:
image_data = [fetch_image(url) for url in image_urls]
sampling_params = SamplingParams(temperature=0.0,
max_tokens=128,
stop_token_ids=stop_token_ids)
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {
"image": image_data
},
},
sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def run_chat(model: str, question: str, image_urls: List[str]):
llm, _, stop_token_ids, _ = model_example_map[model](question, image_urls)
sampling_params = SamplingParams(temperature=0.0,
max_tokens=128,
stop_token_ids=stop_token_ids)
outputs = llm.chat([{
"role":
"user",
"content": [
{
"type": "text",
"text": question,
},
*({
"type": "image_url",
"image_url": {
"url": image_url
},
} for image_url in image_urls),
],
}],
sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def main(args: Namespace):
model = args.model_type
method = args.method
if method == "generate":
run_generate(model, QUESTION, IMAGE_URLS)
elif method == "chat":
run_chat(model, QUESTION, IMAGE_URLS)
else:
raise ValueError(f"Invalid method: {method}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input')
parser.add_argument('--model-type',
'-m',
type=str,
default="phi3_v",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument("--method",
type=str,
default="generate",
choices=["generate", "chat"],
help="The method to run in `vllm.LLM`.")
args = parser.parse_args()
main(args)


@@ -0,0 +1,33 @@
import os
from vllm import LLM, SamplingParams
# Enable the torch profiler; this can also be set on the command line.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm.start_profile()
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
llm.stop_profile()
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


@@ -27,9 +27,10 @@ client = OpenAI(
models = client.models.list() models = client.models.list()
model = models.data[0].id model = models.data[0].id
# Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Use image url in the payload ## Use image url in the payload
chat_completion_from_url = client.chat.completions.create( chat_completion_from_url = client.chat.completions.create(
messages=[{ messages=[{
"role": "role":
@@ -52,10 +53,10 @@ chat_completion_from_url = client.chat.completions.create(
) )
result = chat_completion_from_url.choices[0].message.content result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output:{result}") print("Chat completion output:", result)
# Use base64 encoded image in the payload ## Use base64 encoded image in the payload
def encode_image_base64_from_url(image_url: str) -> str: def encode_image_base64_from_url(image_url: str) -> str:
"""Encode an image retrieved from a remote url to base64 format.""" """Encode an image retrieved from a remote url to base64 format."""
@@ -122,4 +123,4 @@ chat_completion_from_url = client.chat.completions.create(
) )
result = chat_completion_from_url.choices[0].message.content result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output:{result}") print("Chat completion output:", result)


@@ -89,22 +89,23 @@
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" and message.tool_calls is defined %} {%- elif message.role == "assistant" and message.tool_calls is defined %}
{{- '<|im_start|>' + message.role }} {{- '<|im_start|>' + message.role }}
{%- for tool_call in message.tool_calls %} {%- for tool_call in message.tool_calls %}
{{- '\n<tool_call>\n' }} {{- '\n<tool_call>\n' }}
{%- if tool_call.function is defined %} {%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %} {%- set tool_call = tool_call.function %}
{%- endif %} {%- endif %}
{{- '{' }} {{- '{' }}
{{- '"name": "' }} {{- '"name": "' }}
{{- tool_call.name }} {{- tool_call.name }}
{{- '"}' }} {{- '"' }}
{%- if tool_call.arguments is defined %}
{{- ', ' }} {{- ', ' }}
{%- if tool_call.arguments is defined %} {{- '"arguments": ' }}
{{- '"arguments": ' }} {{- tool_call.arguments|tojson }}
{{- tool_call.arguments|tojson }} {%- endif %}
{%- endif %} {{- '}' }}
{{- '\n</tool_call>' }} {{- '\n</tool_call>' }}
{%- endfor %} {%- endfor %}
{{- '<|im_end|>\n' }} {{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %} {%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %} {%- if loop.previtem and loop.previtem.role != "tool" %}


@@ -1,3 +0,0 @@
# Dependencies for Ray accelerated DAG
cupy-cuda12x
ray >= 2.32


@@ -9,7 +9,7 @@ tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
fastapi fastapi
aiohttp aiohttp
openai >= 1.0 # Ensure modern openai package (ensure types module present) openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
uvicorn[standard] uvicorn[standard]
pydantic >= 2.8 # Required for OpenAI server. pydantic >= 2.8 # Required for OpenAI server.
pillow # Required for image processing pillow # Required for image processing
@@ -25,5 +25,7 @@ pyzmq
msgspec msgspec
gguf == 0.9.1 gguf == 0.9.1
importlib_metadata importlib_metadata
mistral_common >= 1.3.4 mistral_common >= 1.4.0
pyyaml pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
einops # Required for Qwen2-VL.


@@ -1,6 +1,3 @@
# Needed for Ray accelerated DAG tests
-r requirements-adag.txt
# testing # testing
pytest pytest
tensorizer>=2.9.0 tensorizer>=2.9.0
@@ -14,9 +11,10 @@ awscli
einops # required for MPT, qwen-vl and Mamba einops # required for MPT, qwen-vl and Mamba
httpx httpx
librosa # required for audio test librosa # required for audio test
opencv-python # required for video test
peft peft
requests requests
ray ray[adag]>=2.35
sentence-transformers # required for embedding sentence-transformers # required for embedding
soundfile # required for audio test soundfile # required for audio test
compressed-tensors==0.4.0 # required for compressed-tensors compressed-tensors==0.4.0 # required for compressed-tensors


@@ -170,14 +170,17 @@ class cmake_build_ext(build_ext):
if is_sccache_available(): if is_sccache_available():
cmake_args += [ cmake_args += [
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
'-DCMAKE_C_COMPILER_LAUNCHER=sccache', '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
] ]
elif is_ccache_available(): elif is_ccache_available():
cmake_args += [ cmake_args += [
'-DCMAKE_C_COMPILER_LAUNCHER=ccache',
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
'-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
] ]
# Pass the python executable to cmake so it can find an exact # Pass the python executable to cmake so it can find an exact
@@ -502,6 +505,7 @@ setup(
ext_modules=ext_modules, ext_modules=ext_modules,
extras_require={ extras_require={
"tensorizer": ["tensorizer>=2.9.0"], "tensorizer": ["tensorizer>=2.9.0"],
"video": ["opencv-python"], # Required for video processing
"audio": ["librosa", "soundfile"] # Required for audio processing "audio": ["librosa", "soundfile"] # Required for audio processing
}, },
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},


@@ -1,6 +1,7 @@
import pytest import pytest
from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
load_chat_template)
from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -87,7 +88,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
add_generation_prompt=add_generation_prompt) add_generation_prompt=add_generation_prompt)
# Call the function and get the result # Call the function and get the result
result = apply_chat_template( result = apply_hf_chat_template(
tokenizer, tokenizer,
conversation=mock_request.messages, conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content, chat_template=mock_request.chat_template or template_content,


@@ -2,7 +2,7 @@ from typing import Optional
import torch import torch
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispacther from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
class MyMod(torch.nn.Module): class MyMod(torch.nn.Module):
@@ -13,7 +13,7 @@ class MyMod(torch.nn.Module):
return x * 2 return x * 2
class MyWrapper(TorchCompileWrapperWithCustomDispacther): class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
def __init__(self, model): def __init__(self, model):
self.model = model self.model = model


@@ -21,6 +21,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TokenizerPoolConfig from vllm.config import TokenizerPoolConfig
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment, from vllm.distributed import (destroy_distributed_environment,
@@ -44,6 +45,7 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]] PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
PromptAudioInput = Union[List[Tuple[np.ndarray, int]], PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
List[List[Tuple[np.ndarray, int]]]] List[List[Tuple[np.ndarray, int]]]]
PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
def _read_prompts(filename: str) -> List[str]: def _read_prompts(filename: str) -> List[str]:
@@ -85,8 +87,35 @@ class _ImageAssets(_ImageAssetsBase):
return [prompts["stop_sign"], prompts["cherry_blossom"]] return [prompts["stop_sign"], prompts["cherry_blossom"]]
class _VideoAssetPrompts(TypedDict):
sample_demo_1: str
if sys.version_info < (3, 9):
# UserList cannot be subscripted
class _VideoAssetsBase(UserList):
pass
else:
class _VideoAssetsBase(UserList[VideoAsset]):
pass
class _VideoAssets(_VideoAssetsBase):
def __init__(self) -> None:
super().__init__([
VideoAsset("sample_demo_1.mp4"),
])
def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
return [prompts["sample_demo_1"]]
IMAGE_ASSETS = _ImageAssets() IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`.""" """Singleton instance of :class:`_ImageAssets`."""
VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`."""
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
@@ -202,6 +231,11 @@ def image_assets() -> _ImageAssets:
return IMAGE_ASSETS return IMAGE_ASSETS
@pytest.fixture(scope="session")
def video_assets() -> _VideoAssets:
return VIDEO_ASSETS
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature) _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
@@ -278,7 +312,8 @@ class HfRunner:
def generate( def generate(
self, self,
prompts: List[str], prompts: List[str],
images: Optional[List[Image.Image]] = None, images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> List[Tuple[List[List[int]], List[str]]]:
if images: if images:
@@ -292,6 +327,8 @@ class HfRunner:
} }
if images is not None and images[i] is not None: if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i] processor_kwargs["images"] = images[i]
if videos is not None and videos[i] is not None:
processor_kwargs["videos"] = videos[i]
inputs = self.processor(**processor_kwargs) inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs) inputs = self.postprocess_inputs(inputs)
@@ -314,7 +351,7 @@ class HfRunner:
self, self,
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[List[Image.Image]] = None, images: Optional[PromptImageInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
outputs = self.generate(prompts, outputs = self.generate(prompts,
@@ -351,7 +388,8 @@ class HfRunner:
self, self,
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[List[Image.Image]] = None, images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
**kwargs: Any, **kwargs: Any,
) -> List[List[torch.Tensor]]: ) -> List[List[torch.Tensor]]:
all_logprobs: List[List[torch.Tensor]] = [] all_logprobs: List[List[torch.Tensor]] = []
@@ -362,6 +400,8 @@ class HfRunner:
} }
if images is not None and images[i] is not None: if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i] processor_kwargs["images"] = images[i]
if videos is not None and videos[i] is not None:
processor_kwargs["videos"] = videos[i]
inputs = self.processor(**processor_kwargs) inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs) inputs = self.postprocess_inputs(inputs)
@@ -433,8 +473,9 @@ class HfRunner:
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
images: Optional[List[Image.Image]] = None, images: Optional[PromptImageInput] = None,
audios: Optional[List[Tuple[np.ndarray, int]]] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[List[np.ndarray]] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]: ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
all_logprobs: List[List[Dict[int, float]]] = [] all_logprobs: List[List[Dict[int, float]]] = []
@@ -454,6 +495,8 @@ class HfRunner:
processor_kwargs["audio"] = audio processor_kwargs["audio"] = audio
processor_kwargs["sampling_rate"] = sr processor_kwargs["sampling_rate"] = sr
if videos is not None:
processor_kwargs["videos"] = videos[i]
inputs = self.processor(**processor_kwargs) inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs) inputs = self.postprocess_inputs(inputs)
@@ -634,12 +677,16 @@ class VllmRunner:
sampling_params: SamplingParams, sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
assert sampling_params.logprobs is not None assert sampling_params.logprobs is not None
if images is not None: if images is not None:
assert len(prompts) == len(images) assert len(prompts) == len(images)
if videos is not None:
assert len(prompts) == len(videos)
inputs = [TextPrompt(prompt=prompt) for prompt in prompts] inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None: if images is not None:
for i, image in enumerate(images): for i, image in enumerate(images):
@@ -649,6 +696,11 @@ class VllmRunner:
for i, audio in enumerate(audios): for i, audio in enumerate(audios):
inputs[i]["multi_modal_data"] = {"audio": audio} inputs[i]["multi_modal_data"] = {"audio": audio}
if videos is not None:
for i, video in enumerate(videos):
inputs[i]["multi_modal_data"] = {"video": video}
print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
req_outputs = self.model.generate(inputs, req_outputs = self.model.generate(inputs,
sampling_params=sampling_params) sampling_params=sampling_params)
return self._final_steps_generate_w_logprobs(req_outputs) return self._final_steps_generate_w_logprobs(req_outputs)
@@ -671,7 +723,7 @@ class VllmRunner:
self, self,
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[List[Image.Image]] = None, images: Optional[PromptImageInput] = None,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts, greedy_params, images=images) outputs = self.generate(prompts, greedy_params, images=images)
@@ -685,6 +737,7 @@ class VllmRunner:
num_logprobs: int, num_logprobs: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
stop_token_ids: Optional[List[int]] = None, stop_token_ids: Optional[List[int]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
greedy_logprobs_params = SamplingParams(temperature=0.0, greedy_logprobs_params = SamplingParams(temperature=0.0,
@@ -694,7 +747,8 @@ class VllmRunner:
outputs = self.generate_w_logprobs(prompts, outputs = self.generate_w_logprobs(prompts,
greedy_logprobs_params, greedy_logprobs_params,
images=images, images=images,
audios=audios) audios=audios,
videos=videos)
return [(output_ids, output_str, output_logprobs) return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs] for output_ids, output_str, output_logprobs in outputs]

View File

@@ -35,9 +35,11 @@ def test_models(hf_runner, vllm_runner, image_assets, model: str,
if model.startswith("llava-hf/llava-1.5"): if model.startswith("llava-hf/llava-1.5"):
from ..models.test_llava import models, run_test from ..models.test_llava import models, run_test
elif model.startswith("llava-hf/llava-v1.6"): elif model.startswith("llava-hf/llava-v1.6"):
from ..models.test_llava_next import models, run_test from ..models.test_llava_next import run_test # type: ignore[no-redef]
from ..models.test_llava_next import models
elif model.startswith("facebook/chameleon"): elif model.startswith("facebook/chameleon"):
from ..models.test_chameleon import models, run_test from ..models.test_chameleon import run_test # type: ignore[no-redef]
from ..models.test_chameleon import models
else: else:
raise NotImplementedError(f"Unsupported model: {model}") raise NotImplementedError(f"Unsupported model: {model}")


@@ -18,23 +18,28 @@ logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, " @pytest.mark.parametrize(
"MODEL_NAME, DIST_BACKEND"), ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
[ "MODEL_NAME, DIST_BACKEND"),
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"), [
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"), (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"), (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"), (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
]) (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"),
],
)
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
DIST_BACKEND): TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
if VLLM_MULTI_NODE and DIST_BACKEND == "mp": if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for " pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend") "multiprocessing distributed backend")
@@ -43,6 +48,8 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
"float16", "float16",
"--max-model-len",
"8192",
"--pipeline-parallel-size", "--pipeline-parallel-size",
str(PP_SIZE), str(PP_SIZE),
"--tensor-parallel-size", "--tensor-parallel-size",
@@ -59,7 +66,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
tp_args = [ tp_args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
"bfloat16", "float16",
"--max-model-len",
"8192",
"--tensor-parallel-size", "--tensor-parallel-size",
str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI. str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
"--distributed-executor-backend", "--distributed-executor-backend",
@@ -71,6 +80,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
if EAGER_MODE: if EAGER_MODE:
pp_args.append("--enforce-eager") pp_args.append("--enforce-eager")
tp_args.append("--enforce-eager") tp_args.append("--enforce-eager")
if TRUST_REMOTE_CODE:
pp_args.append("--trust-remote-code")
tp_args.append("--trust-remote-code")
pp_env = None pp_env = None
if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2 if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
and CHUNKED_PREFILL): and CHUNKED_PREFILL):


@@ -83,7 +83,7 @@ def test_local_workers() -> None:
workers[3].process.kill() workers[3].process.kill()
# Other workers should get shut down here # Other workers should get shut down here
worker_monitor.join(2) worker_monitor.join(20)
# Ensure everything is stopped # Ensure everything is stopped
assert not worker_monitor.is_alive() assert not worker_monitor.is_alive()
@@ -108,7 +108,7 @@ def test_local_workers_clean_shutdown() -> None:
# Clean shutdown # Clean shutdown
worker_monitor.close() worker_monitor.close()
worker_monitor.join(5) worker_monitor.join(20)
# Ensure everything is stopped # Ensure everything is stopped
assert not worker_monitor.is_alive() assert not worker_monitor.is_alive()
@@ -161,7 +161,7 @@ async def test_local_workers_async() -> None:
workers[3].process.kill() workers[3].process.kill()
# Other workers should get shut down here # Other workers should get shut down here
worker_monitor.join(2) worker_monitor.join(20)
# Ensure everything is stopped # Ensure everything is stopped
assert not worker_monitor.is_alive() assert not worker_monitor.is_alive()


@@ -50,7 +50,7 @@ def zephyr_lora_files():
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files): def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
lora_request = [ lora_request = [
LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files) LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
for idx in range(len(PROMPTS)) for idx in range(len(PROMPTS))
] ]
# Multiple SamplingParams should be matched with each prompt # Multiple SamplingParams should be matched with each prompt


@@ -8,7 +8,9 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" {"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""


@@ -0,0 +1,107 @@
from http import HTTPStatus
from unittest.mock import MagicMock
import pytest
from vllm.config import ModelConfig
from vllm.engine.protocol import AsyncEngineClient
from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest,
UnloadLoraAdapterRequest)
from vllm.entrypoints.openai.serving_engine import OpenAIServing
MODEL_NAME = "meta-llama/Llama-2-7b"
LORA_LOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' added successfully.")
LORA_UNLOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' removed successfully.")
async def _async_serving_engine_init():
mock_engine_client = MagicMock(spec=AsyncEngineClient)
mock_model_config = MagicMock(spec=ModelConfig)
# Set the max_model_len attribute to avoid a missing-attribute error
mock_model_config.max_model_len = 2048
serving_engine = OpenAIServing(mock_engine_client,
mock_model_config,
served_model_names=[MODEL_NAME],
lora_modules=None,
prompt_adapters=None,
request_logger=None)
return serving_engine
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
serving_engine = await _async_serving_engine_init()
request = LoadLoraAdapterRequest(lora_name="adapter",
lora_path="/path/to/adapter2")
response = await serving_engine.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
assert len(serving_engine.lora_requests) == 1
assert serving_engine.lora_requests[0].lora_name == "adapter"
@pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields():
serving_engine = await _async_serving_engine_init()
request = LoadLoraAdapterRequest(lora_name="", lora_path="")
response = await serving_engine.load_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
async def test_load_lora_adapter_duplicate():
serving_engine = await _async_serving_engine_init()
request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1')
assert len(serving_engine.lora_requests) == 1
request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST
assert len(serving_engine.lora_requests) == 1
@pytest.mark.asyncio
async def test_unload_lora_adapter_success():
serving_engine = await _async_serving_engine_init()
request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request)
assert len(serving_engine.lora_requests) == 1
request = UnloadLoraAdapterRequest(lora_name="adapter1")
response = await serving_engine.unload_lora_adapter(request)
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1')
assert len(serving_engine.lora_requests) == 0
@pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields():
serving_engine = await _async_serving_engine_init()
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
response = await serving_engine.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
async def test_unload_lora_adapter_not_found():
serving_engine = await _async_serving_engine_init()
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
response = await serving_engine.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST
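(Aside, not part of the diff: these handlers back vLLM's runtime LoRA load/unload endpoints, which the API server only exposes when runtime LoRA updating is enabled. The sketch below is an assumption based on the request models tested above; the endpoint path, port, and payload are illustrative, not taken from this diff.)

import requests

# Hypothetical call against a locally running server with runtime LoRA updates enabled.
resp = requests.post(
    "http://localhost:8000/v1/load_lora_adapter",
    json={"lora_name": "adapter1", "lora_path": "/path/to/adapter1"},
)
# Expect either the success message or an ErrorResponse body, mirroring the tests above.
print(resp.status_code, resp.text)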

View File

@@ -3,8 +3,10 @@ from typing import Type
import pytest
import torch

from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
                                                    NewGELU, QuickGELU,
                                                    SiluAndMul)

from .allclose_default import get_default_atol, get_default_rtol
@@ -39,18 +41,28 @@ def test_act_and_mul(
    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
    if activation == "silu":
        layer = SiluAndMul()
        fn = torch.ops._C.silu_and_mul
    elif activation == "gelu":
        layer = GeluAndMul(approximate="none")
        fn = torch.ops._C.gelu_and_mul
    elif activation == "gelu_tanh":
        layer = GeluAndMul(approximate="tanh")
        fn = torch.ops._C.gelu_tanh_and_mul
    out = layer(x)
    ref_out = layer.forward_native(x)
    # The SiLU and GELU implementations are equivalent to the native PyTorch
    # implementations, so we can do exact comparison.
    torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
opcheck(fn, (out, x))
@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
(NewGELU, torch.ops._C.gelu_new),
(QuickGELU, torch.ops._C.gelu_quick)])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D) @pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@@ -70,10 +82,14 @@ def test_activation(
    torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, d, dtype=dtype)
    layer = activation[0]()
    fn = activation[1]
    out = layer(x)
    ref_out = layer.forward_native(x)
    torch.testing.assert_close(out,
                               ref_out,
                               atol=get_default_atol(out),
                               rtol=get_default_rtol(out))
out = torch.empty_like(x)
opcheck(fn, (out, x))
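(Aside, not part of the diff: for readers unfamiliar with the fused activation-and-multiply ops being opcheck'd here, the reference semantics implied by the forward_native comparison above split the input's last dimension in half, apply the activation to the first half, and multiply elementwise by the second. A minimal sketch, not the vLLM implementation itself:)

import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # x has shape (..., 2 * d); the result has shape (..., d).
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]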

View File

@@ -6,6 +6,7 @@ import torch
from xformers import ops as xops from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import get_max_shared_memory_bytes, is_hip from vllm.utils import get_max_shared_memory_bytes, is_hip
@@ -198,6 +199,13 @@ def test_paged_attention(
k_scale, k_scale,
v_scale, v_scale,
) )
opcheck(torch.ops._C.paged_attention_v1,
(output, query, key_cache, value_cache, num_kv_heads, scale,
block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
cond=(head_size == HEAD_SIZES[0]))
elif version == "v2": elif version == "v2":
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
assert PARTITION_SIZE % block_size == 0 assert PARTITION_SIZE % block_size == 0
@@ -230,6 +238,14 @@ def test_paged_attention(
k_scale, k_scale,
v_scale, v_scale,
) )
opcheck(torch.ops._C.paged_attention_v2,
(output, exp_sums, max_logits, tmp_output, query, key_cache,
value_cache, num_kv_heads, scale, block_tables, seq_lens,
block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
k_scale, v_scale, 0, 0, 0, 64, 0),
cond=(head_size == HEAD_SIZES[0]))
else: else:
raise AssertionError(f"Unknown version: {version}") raise AssertionError(f"Unknown version: {version}")

View File

@@ -4,6 +4,7 @@ from typing import List, Tuple
import pytest import pytest
import torch import torch
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
@@ -87,6 +88,11 @@ def test_copy_blocks(
block_mapping_tensor = torch.tensor(block_mapping, block_mapping_tensor = torch.tensor(block_mapping,
dtype=torch.int64, dtype=torch.int64,
device=device).view(-1, 2) device=device).view(-1, 2)
opcheck(torch.ops._C_cache_ops.copy_blocks,
(key_caches, value_caches, block_mapping_tensor),
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
cond=(head_size == HEAD_SIZES[0]))
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
# Run the reference implementation. # Run the reference implementation.
@@ -162,6 +168,10 @@ def test_reshape_and_cache(
k_scale = v_scale = 1.0 k_scale = v_scale = 1.0
# Call the reshape_and_cache kernel. # Call the reshape_and_cache kernel.
opcheck(torch.ops._C_cache_ops.reshape_and_cache,
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
k_scale, v_scale),
cond=(head_size == HEAD_SIZES[0]))
ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
kv_cache_dtype, k_scale, v_scale) kv_cache_dtype, k_scale, v_scale)
@@ -269,6 +279,10 @@ def test_reshape_and_cache_flash(
k_scale = v_scale = 1.0 k_scale = v_scale = 1.0
# Call the reshape_and_cache kernel. # Call the reshape_and_cache kernel.
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
k_scale, v_scale),
cond=(head_size == HEAD_SIZES[0]))
ops.reshape_and_cache_flash(key, value, key_cache, value_cache, ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
slot_mapping, kv_cache_dtype, k_scale, v_scale) slot_mapping, kv_cache_dtype, k_scale, v_scale)
@@ -366,6 +380,14 @@ def test_swap_blocks(
src_value_caches_clone = src_value_caches[0].clone() src_value_caches_clone = src_value_caches[0].clone()
# Call the swap_blocks kernel. # Call the swap_blocks kernel.
do_opcheck = (head_size == HEAD_SIZES[0])
opcheck(torch.ops._C_cache_ops.swap_blocks,
(src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
cond=do_opcheck)
opcheck(torch.ops._C_cache_ops.swap_blocks,
(src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
cond=do_opcheck)
ops.swap_blocks(src_key_caches[0], dist_key_caches[0], ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
block_mapping_tensor) block_mapping_tensor)
ops.swap_blocks(src_value_caches[0], dist_value_caches[0], ops.swap_blocks(src_value_caches[0], dist_value_caches[0],

View File

@@ -7,6 +7,7 @@ from typing import Optional, Type
import pytest import pytest
import torch import torch
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
@@ -108,6 +109,9 @@ def cutlass_int8_gemm_helper(m: int,
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
opcheck(torch.ops._C.cutlass_scaled_mm,
(out, a, b, scale_a, scale_b, bias))
@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33]) @pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024]) @pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
@@ -341,6 +345,15 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol) torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol) torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
if azp_per_token:
opcheck(torch.ops._C.cutlass_scaled_mm_azp,
(out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32,
func_bias))
else:
opcheck(torch.ops._C.cutlass_scaled_mm_azp,
(out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None,
func_bias))
# Test working with a subset of A and B # Test working with a subset of A and B
def test_cutlass_subset(): def test_cutlass_subset():

View File

@@ -445,7 +445,8 @@ def test_flashinfer_decode_with_paged_fp8_kv(
        head_size,
        block_size,
        "NONE",
        data_type=dtype,
        q_data_type=dtype)
    output = wrapper.forward(query,
                             kv_cache_fp8,
                             logits_soft_cap=soft_cap,

View File

@@ -2,6 +2,7 @@ import pytest
import torch import torch
from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant from vllm._custom_ops import scaled_int8_quant
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -12,6 +13,16 @@ SEEDS = [0]
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
def opcheck_int8_quant(output, input, scale=None):
if scale is not None:
opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale))
else:
scale = torch.empty((input.numel() // input.shape[-1], 1),
device=input.device,
dtype=torch.float32)
opcheck(torch.ops._C.dynamic_scaled_int8_quant, (output, input, scale))
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@@ -34,6 +45,8 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
ops_out, ref_out, atol=1, ops_out, ref_out, atol=1,
rtol=0.0) # big atol to account for rounding errors rtol=0.0) # big atol to account for rounding errors
opcheck_int8_quant(ops_out, x)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@@ -58,3 +71,5 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
torch.testing.assert_close( torch.testing.assert_close(
out1, out2, atol=1, out1, out2, atol=1,
rtol=0.0) # big atol to account for rounding errors rtol=0.0) # big atol to account for rounding errors
opcheck_int8_quant(out2, x, scale)
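(Aside, not part of the diff: a rough reference for the two quantization modes being checked, inferred from the op names — the static path uses a caller-provided scale, the dynamic path derives a per-token scale from the input. The real kernels may round and saturate slightly differently, which is why the tests above compare with atol=1.)

import torch

def static_int8_quant_ref(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Quantize with a fixed, caller-provided scale.
    return torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)

def dynamic_int8_quant_ref(x: torch.Tensor):
    # Derive one scale per token (row) from the observed absolute maximum.
    scale = x.abs().amax(dim=-1, keepdim=True) / 127.0
    q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return q, scale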

View File

@@ -1,6 +1,7 @@
import pytest import pytest
import torch import torch
from tests.kernels.utils import opcheck
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -52,3 +53,10 @@ def test_rms_norm(
torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2) torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
else: else:
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
if residual is not None:
opcheck(torch.ops._C.fused_add_rms_norm,
(x, residual, layer.weight.data, layer.variance_epsilon))
else:
opcheck(torch.ops._C.rms_norm,
(out, x, layer.weight.data, layer.variance_epsilon))
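(Aside, not part of the diff: the reference math behind the two ops being checked, assuming the usual RMSNorm definition — rms_norm scales the input by the reciprocal root-mean-square over its last dimension and a learned weight, while fused_add_rms_norm first adds the residual in place and then normalizes the sum.)

import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # Normalize over the last dimension.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight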

View File

@@ -9,6 +9,7 @@ from typing import Optional, Tuple
import pytest import pytest
import torch import torch
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
pack_rows, quantize_weights) pack_rows, quantize_weights)
@@ -76,6 +77,8 @@ def machete_quantize_and_pack(w: torch.Tensor,
w_q = w_q.t().contiguous().t() # convert to col major w_q = w_q.t().contiguous().t() # convert to col major
w_q_machete = ops.machete_prepack_B(w_q, wtype) w_q_machete = ops.machete_prepack_B(w_q, wtype)
opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype))
return w_ref, w_q_machete, w_s, w_zp return w_ref, w_q_machete, w_s, w_zp
@@ -146,6 +149,10 @@ def test_machete_all_schedules(shape, atype: torch.dtype,
schedule=schedule, schedule=schedule,
) )
opcheck(torch.ops._C.machete_gemm,
(a, w_q_machete, wtype, w_s, maybe_convert_zeropoints(
w_zp, w_s), group_size, None, None, None, schedule))
# Relax atol as our reduction dim becomes larger (more rounding error) # Relax atol as our reduction dim becomes larger (more rounding error)
# Relax atol when we have zeropoints since the way machete applies # Relax atol when we have zeropoints since the way machete applies
# zeropoints (after scales) causes noise around 0 # zeropoints (after scales) causes noise around 0

View File

@@ -5,6 +5,7 @@ Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
import pytest import pytest
import torch import torch
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
@@ -73,12 +74,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
act_order, mnk_factors): act_order, mnk_factors):
m_factor, n_factor, k_factor = mnk_factors m_factor, n_factor, k_factor = mnk_factors
size_m = m_factor
size_k = k_chunk * k_factor size_k = k_chunk * k_factor
size_n = n_chunk * n_factor size_n = n_chunk * n_factor
print(f"MNK = {size_m} {size_n} {size_k}")
# Filter act_order # Filter act_order
if act_order: if act_order:
if group_size == -1: if group_size == -1:
@@ -112,6 +110,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits, marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
weight_perm) weight_perm)
opcheck(torch.ops._C.gptq_marlin_repack,
(q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits))
# Run Marlin repack GPU kernel # Run Marlin repack GPU kernel
marlin_q_w_2 = ops.gptq_marlin_repack( marlin_q_w_2 = ops.gptq_marlin_repack(
q_w_gptq, q_w_gptq,
@@ -137,12 +138,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
mnk_factors): mnk_factors):
m_factor, n_factor, k_factor = mnk_factors m_factor, n_factor, k_factor = mnk_factors
size_m = m_factor
size_k = k_chunk * k_factor size_k = k_chunk * k_factor
size_n = n_chunk * n_factor size_n = n_chunk * n_factor
print(f"MNK = {size_m} {size_n} {size_k}")
# Normalize group_size # Normalize group_size
if group_size == -1: if group_size == -1:
group_size = size_k group_size = size_k
@@ -165,6 +163,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits, marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
weight_perm) weight_perm)
opcheck(torch.ops._C.awq_marlin_repack,
(q_w_awq, size_k, size_n, quant_type.size_bits))
# Run Marlin repack GPU kernel # Run Marlin repack GPU kernel
marlin_q_w_2 = ops.awq_marlin_repack( marlin_q_w_2 = ops.awq_marlin_repack(
q_w_awq, q_w_awq,
@@ -204,9 +205,6 @@ def test_gptq_marlin_gemm(
size_k = k_chunk * k_factor size_k = k_chunk * k_factor
size_n = n_chunk * n_factor size_n = n_chunk * n_factor
print(f"MNK = {size_m} {size_n} {size_k}")
print(f"groupsize = {group_size}")
if act_order: if act_order:
if group_size == -1: if group_size == -1:
return return
@@ -224,6 +222,13 @@ def test_gptq_marlin_gemm(
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL) GPTQ_MARLIN_MAX_PARALLEL)
opcheck(
torch.ops._C.gptq_marlin_gemm,
(a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices,
workspace.scratch, quant_type, a_input.shape[0], b_weight.shape[1],
a_input.shape[1], is_k_full, False, use_fp32_reduce),
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
output = ops.gptq_marlin_gemm( output = ops.gptq_marlin_gemm(
a_input, a_input,
marlin_q_w, marlin_q_w,
@@ -245,7 +250,6 @@ def test_gptq_marlin_gemm(
torch.cuda.synchronize() torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref) max_diff = compute_max_diff(output, output_ref)
print("max_diff = {}".format(max_diff))
assert max_diff < 0.04 assert max_diff < 0.04
@@ -265,9 +269,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
size_k = k_chunk * k_factor size_k = k_chunk * k_factor
size_n = n_chunk * n_factor size_n = n_chunk * n_factor
print(f"MNK = {size_m} {size_n} {size_k}")
print(f"groupsize = {group_size}")
a_input = rand_data((size_m, size_k)) a_input = rand_data((size_m, size_k))
b_weight = rand_data((size_k, size_n)) b_weight = rand_data((size_k, size_n))
@@ -279,6 +280,12 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
output_ref = torch.matmul(a_input, w_24_ref) output_ref = torch.matmul(a_input, w_24_ref)
opcheck(torch.ops._C.gptq_marlin_24_gemm,
(a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s,
workspace_24.scratch, quant_type, a_input.shape[0],
b_weight.shape[1], a_input.shape[1]),
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
output = ops.gptq_marlin_24_gemm( output = ops.gptq_marlin_24_gemm(
a_input, a_input,
marlin_24_q_w_comp, marlin_24_q_w_comp,
@@ -294,7 +301,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
torch.cuda.synchronize() torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref) max_diff = compute_max_diff(output, output_ref)
print("max_diff = {}".format(max_diff))
assert max_diff < 0.04 assert max_diff < 0.04
@@ -321,9 +327,6 @@ def test_fp8_marlin_gemm(
size_k = k_chunk * k_factor size_k = k_chunk * k_factor
size_n = n_chunk * n_factor size_n = n_chunk * n_factor
print(f"MNK = {size_m} {size_n} {size_k}")
print(f"groupsize = {group_size}")
a_input = rand_data((size_m, size_k), dtype=dtype) a_input = rand_data((size_m, size_k), dtype=dtype)
b_weight = rand_data((size_k, size_n), dtype=dtype) b_weight = rand_data((size_k, size_n), dtype=dtype)
@@ -353,6 +356,10 @@ def test_fp8_marlin_gemm(
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL) GPTQ_MARLIN_MAX_PARALLEL)
opcheck(torch.ops._C.fp8_marlin_gemm,
(a_input, marlin_qweight, marlin_scales, workspace.scratch,
num_bits, a_input.shape[0], b_weight.shape[1], a_input.shape[1]))
output = ops.fp8_marlin_gemm( output = ops.fp8_marlin_gemm(
a=a_input, a=a_input,
b_q_weight=marlin_qweight, b_q_weight=marlin_qweight,
@@ -368,7 +375,6 @@ def test_fp8_marlin_gemm(
torch.cuda.synchronize() torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref) max_diff = compute_max_diff(output, output_ref)
print("max_diff = {}".format(max_diff))
assert max_diff < 0.04 assert max_diff < 0.04
@@ -396,9 +402,6 @@ def test_awq_marlin_gemm(
size_k = k_chunk * k_factor size_k = k_chunk * k_factor
size_n = n_chunk * n_factor size_n = n_chunk * n_factor
print(f"MNK = {size_m} {size_n} {size_k}")
print(f"groupsize = {group_size}")
a_input = rand_data((size_m, size_k)) a_input = rand_data((size_m, size_k))
b_weight = rand_data((size_k, size_n)) b_weight = rand_data((size_k, size_n))
@@ -434,7 +437,6 @@ def test_awq_marlin_gemm(
torch.cuda.synchronize() torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref) max_diff = compute_max_diff(output, output_ref)
print("max_diff = {}".format(max_diff))
assert max_diff < 0.04 assert max_diff < 0.04
@@ -460,9 +462,6 @@ def test_marlin_qqq_gemm(
size_k = k_chunk * k_factor size_k = k_chunk * k_factor
size_n = n_chunk * n_factor size_n = n_chunk * n_factor
print(f"MNK = {size_m} {size_n} {size_k}")
print(f"groupsize = {group_size}")
a_input = rand_data((size_m, size_k)) a_input = rand_data((size_m, size_k))
b_weight = rand_data((size_k, size_n)) b_weight = rand_data((size_k, size_n))
@@ -479,6 +478,11 @@ def test_marlin_qqq_gemm(
workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N, workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
MARLIN_QQQ_MAX_PARALLEL) MARLIN_QQQ_MAX_PARALLEL)
opcheck(torch.ops._C.marlin_qqq_gemm,
(q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
b_weight.shape[1], a_input.shape[1]))
output = ops.marlin_qqq_gemm( output = ops.marlin_qqq_gemm(
q_a, q_a,
marlin_qqq_q_w, marlin_qqq_q_w,
@@ -495,6 +499,5 @@ def test_marlin_qqq_gemm(
torch.cuda.synchronize() torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref) max_diff = compute_max_diff(output, output_ref)
print("max_diff = {}".format(max_diff))
assert max_diff < 0.04 assert max_diff < 0.04

View File

@@ -2,6 +2,8 @@
Run `pytest tests/kernels/test_moe.py`. Run `pytest tests/kernels/test_moe.py`.
""" """
from typing import List
import pytest import pytest
import torch import torch
from transformers import MixtralConfig from transformers import MixtralConfig
@@ -9,7 +11,13 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
fused_marlin_moe, single_marlin_moe)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
marlin_quantize)
from vllm.model_executor.models.mixtral import MixtralMoE from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.scalar_type import scalar_types
def torch_moe(a, w1, w2, score, topk): def torch_moe(a, w1, w2, score, topk):
@@ -29,6 +37,20 @@ def torch_moe(a, w1, w2, score, topk):
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
def torch_moe_single(a, w, score, topk):
B, D = a.shape
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
score = torch.softmax(score, dim=-1, dtype=torch.float32)
_, topk_ids = torch.topk(score, topk)
topk_ids = topk_ids.view(-1)
for i in range(w.shape[0]):
mask = topk_ids == i
if mask.sum():
out[mask] = a[mask] @ w[i].transpose(0, 1)
return (out.view(B, -1, w.shape[1])).sum(dim=1)
@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("n", [2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 511, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024])
@@ -43,11 +65,11 @@ def test_fused_moe(
topk: int, topk: int,
dtype: torch.dtype, dtype: torch.dtype,
): ):
    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
    score = torch.randn((m, e), device="cuda", dtype=dtype)
    triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
    torch_output = torch_moe(a, w1, w2, score, topk)
    torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
@@ -99,3 +121,194 @@ def test_mixtral_moe(dtype: torch.dtype):
vllm_states, vllm_states,
rtol=mixtral_moe_tol[dtype], rtol=mixtral_moe_tol[dtype],
atol=mixtral_moe_tol[dtype]) atol=mixtral_moe_tol[dtype])
def stack_and_dev(tensors: List[torch.Tensor]):
dev = tensors[0].device
return torch.stack(tensors, dim=0).to(dev)
def compute_max_diff(output, output_ref):
return torch.mean(torch.abs(output - output_ref)) / torch.mean(
torch.abs(output_ref))
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 1024, 512])
@pytest.mark.parametrize("e", [4, 8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
@pytest.mark.parametrize("act_order", [True, False])
def test_fused_marlin_moe(
m: int,
n: int,
k: int,
e: int,
topk: int,
group_size: int,
act_order: bool,
):
torch.manual_seed(7)
if topk > e:
return
# Filter act_order
if act_order:
if group_size == -1:
return
if group_size in (k, n):
return
quant_type = scalar_types.uint4b8
dtype = torch.float16
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
for i in range(w2.shape[0]):
w2[0] = torch.eye(k, n, device="cuda", dtype=dtype)
w_ref1_l = []
qweight1_l = []
scales1_l = []
g_idx1_l = []
sort_indices1_l = []
for i in range(w1.shape[0]):
test_perm = torch.randperm(k)
w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
w1[i].transpose(1, 0), quant_type, group_size, act_order,
test_perm)
w_ref1_l.append(w_ref1)
qweight1_l.append(qweight1)
scales1_l.append(scales1)
g_idx1_l.append(g_idx1)
sort_indices1_l.append(sort_indices1)
w_ref1 = stack_and_dev(w_ref1_l)
qweight1 = stack_and_dev(qweight1_l).contiguous()
scales1 = stack_and_dev(scales1_l)
g_idx1 = stack_and_dev(g_idx1_l)
sort_indices1 = stack_and_dev(sort_indices1_l)
w_ref2_l = []
qweight2_l = []
scales2_l = []
g_idx2_l = []
sort_indices2_l = []
for i in range(w2.shape[0]):
test_perm = torch.randperm(n)
w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
w2[i].transpose(1, 0), quant_type, group_size, act_order,
test_perm)
w_ref2_l.append(w_ref2)
qweight2_l.append(qweight2)
scales2_l.append(scales2)
g_idx2_l.append(g_idx2)
sort_indices2_l.append(sort_indices2)
w_ref2 = stack_and_dev(w_ref2_l)
qweight2 = stack_and_dev(qweight2_l).contiguous()
scales2 = stack_and_dev(scales2_l)
g_idx2 = stack_and_dev(g_idx2_l)
sort_indices2 = stack_and_dev(sort_indices2_l)
score = torch.randn((m, e), device="cuda", dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, False)
triton_output = fused_moe(
a,
w_ref1.transpose(1, 2).contiguous(),
w_ref2.transpose(1, 2).contiguous(),
score,
topk,
renormalize=False,
)
marlin_output = fused_marlin_moe(
a,
qweight1,
qweight2,
score,
g_idx1,
g_idx2,
sort_indices1,
sort_indices2,
topk_weights,
topk_ids,
w1_scale=scales1,
w2_scale=scales2,
)
assert compute_max_diff(marlin_output, triton_output) < 4e-2
@pytest.mark.skip("This test is here for the sake of debugging, "
"don't run it in automated tests.")
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 1024, 512])
@pytest.mark.parametrize("e", [4, 8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
@pytest.mark.parametrize("act_order", [True, False])
def test_marlin_moe_mmm(
m: int,
n: int,
k: int,
e: int,
topk: int,
group_size: int,
act_order: bool,
):
if topk > e:
return
# Filter act_order
if act_order:
if group_size == -1:
return
if group_size == k:
return
quant_type = scalar_types.uint4b8
dtype = torch.float16
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
w_ref_l = []
qweights_l = []
scales_l = []
g_idx_l = []
sort_indices_l = []
for i in range(w.shape[0]):
test_perm = torch.randperm(k)
w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm)
w_ref_l.append(w_ref)
qweights_l.append(qweight)
scales_l.append(scales)
g_idx_l.append(g_idx)
sort_indices_l.append(sort_indices)
w_ref = stack_and_dev(w_ref_l)
qweight = stack_and_dev(qweights_l).contiguous()
scales = stack_and_dev(scales_l)
g_idx = stack_and_dev(g_idx_l)
sort_indices = stack_and_dev(sort_indices_l)
score = torch.randn((m, e), device="cuda", dtype=dtype)
marlin_output = single_marlin_moe(a,
qweight,
scales,
score,
g_idx,
sort_indices,
topk,
renormalize=False)
torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
assert compute_max_diff(marlin_output, torch_output) < 1e-2

View File

@@ -3,7 +3,8 @@
import itertools
import random
from numbers import Number
from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple,
                    Union)

import pytest
import torch
@@ -13,6 +14,21 @@ from vllm.attention.backends.xformers import XFormersBackend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
                        make_tensor_with_pad)
# For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4.
DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
"test_schema",
"test_autograd_registration",
"test_faketensor",
)
ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
"test_schema",
"test_autograd_registration",
"test_faketensor",
"test_aot_dispatch_dynamic",
)
class QKVInputs(NamedTuple): class QKVInputs(NamedTuple):
''' '''
@@ -926,3 +942,19 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
ideal_output = test_params.packed_qkvo.ideal_output ideal_output = test_params.packed_qkvo.ideal_output
torch.testing.assert_close(ideal_output, torch.testing.assert_close(ideal_output,
output_under_test.view_as(ideal_output)) output_under_test.view_as(ideal_output))
def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
torch._library.custom_ops.CustomOpDef],
args: Tuple[Any, ...],
kwargs: Optional[Dict[str, Any]] = None,
*,
test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
raise_exception: bool = True,
cond: bool = True) -> Dict[str, str]:
return torch.library.opcheck(
op,
args,
kwargs,
test_utils=test_utils,
raise_exception=raise_exception) if cond else {}
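(Aside, not part of the diff: the opcheck wrapper above forwards to torch.library.opcheck, which validates that a custom op's schema, autograd registration, and fake-tensor implementation agree with its eager behavior. A self-contained sketch with a made-up op, using the same test_utils tuple as DEFAULT_OPCHECK_TEST_UTILS above:)

import torch

@torch.library.custom_op("mylib::scale", mutates_args=())
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Eager implementation of the toy op.
    return x * factor

@scale.register_fake
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Shape/dtype-only implementation used by the fake-tensor check.
    return torch.empty_like(x)

torch.library.opcheck(
    torch.ops.mylib.scale,
    (torch.randn(4), 2.0),
    test_utils=("test_schema", "test_autograd_registration", "test_faketensor"),
)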

View File

@@ -7,26 +7,6 @@ import pytest
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
example_prompts = [
'vLLM is a high-throughput and memory-efficient inference and serving '
'engine for LLMs.\n',
'Briefly describe the major milestones in the development of artificial '
'intelligence from 1950 to 2020.\n',
'Compare and contrast artificial intelligence with human intelligence in '
'terms of processing information.\n',
'Describe the basic components of a neural network and how it can be '
'trained.\n',
'Write a short story about a robot that dreams for the first time.\n',
'Analyze the impact of the COVID-19 pandemic on global economic structures '
'and future business models.\n',
'Explain the cultural significance of the Mona Lisa painting, and how its '
'perception might vary in Western versus Eastern societies.\n',
"Translate the following English sentence into Japanese, French, and "
"Swahili: 'The early bird catches the worm.'\n"
]
# These ground truth generations were generated using `transformers==4.38.1 # These ground truth generations were generated using `transformers==4.38.1
# aqlm==1.1.0 torch==2.2.0` # aqlm==1.1.0 torch==2.2.0`
# and the below code: # and the below code:

View File

@@ -1,5 +1,5 @@
import types
from typing import List, Optional, Tuple, Type, Union
import pytest import pytest
import torch import torch
@@ -9,7 +9,8 @@ from transformers import AutoConfig
from vllm.multimodal.utils import rescale_image_size
from vllm.utils import is_cpu

from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                        _ImageAssets)
from .utils import check_logprobs_close

pytestmark = pytest.mark.vlm
@@ -20,6 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"cherry_blossom": "cherry_blossom":
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
}) })
HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
models = [ models = [
"OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-1B",
@@ -64,13 +66,13 @@ def generate(
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    *,
size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
@@ -83,12 +85,6 @@ def run_test(
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
@@ -110,13 +106,21 @@ def run_test(
self.max_num = self.config.max_dynamic_patch self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size self.image_size = self.vision_config.image_size
    def __call__(self, text: str, images: Union[Image, List[Image]],
                 **kwargs):
        from vllm.model_executor.models.internvl import (
            IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
        images = [images] if isinstance(images, Image) else images
        pixel_values = [
            image_to_pixel_values(image, self.image_size, self.min_num,
                                  self.max_num,
                                  self.use_thumbnail).to(self.dtype)
            for image in images
        ]
        num_patches_list = [
            pixel_value.shape[0] for pixel_value in pixel_values
        ]
        pixel_values = torch.cat(pixel_values, dim=0)
        for num_patches in num_patches_list:
            context_tokens = IMG_CONTEXT * self.num_image_token \
                * num_patches
@@ -130,6 +134,7 @@ def run_test(
with vllm_runner(model, with vllm_runner(model,
max_model_len=4096, max_model_len=4096,
dtype=dtype, dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model: enforce_eager=True) as vllm_model:
@@ -138,7 +143,7 @@ def run_test(
max_tokens, max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
images=images) images=images)
for prompts, images in inputs_per_image for prompts, images in inputs
] ]
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
@@ -156,7 +161,7 @@ def run_test(
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
images=hf_images, images=hf_images,
eos_token_id=eos_token_id) eos_token_id=eos_token_id)
for prompts, hf_images in inputs_per_image for prompts, hf_images in inputs
] ]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -264,15 +269,64 @@ if is_cpu():
@torch.inference_mode() @torch.inference_mode()
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_tokens: int, num_logprobs: int) -> None: dtype: str, max_tokens: int, num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
run_test( run_test(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
image_assets, inputs_per_image,
model, model,
size_factors=size_factors,
dtype=dtype, dtype=dtype,
max_tokens=max_tokens, max_tokens=max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.5, 0.75, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors])
]
run_test(
hf_runner,
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1, tensor_parallel_size=1,
) )
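(Aside, not part of the diff: the limit_mm_per_prompt argument threaded through run_test above is what allows more than one image per prompt. A hedged offline sketch — the model name is just the first entry in the models list, and the keyword arguments are assumed to match those passed to vllm_runner:)

from vllm import LLM

llm = LLM(
    model="OpenGVLab/InternVL2-1B",
    max_model_len=4096,
    limit_mm_per_prompt={"image": 2},  # allow two images per prompt, as in mm_limit=2 above
)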

View File

@@ -1,4 +1,4 @@
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type, overload
import pytest import pytest
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
@@ -8,11 +8,14 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
from .utils import check_logprobs_close from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm pytestmark = pytest.mark.vlm
_LIMIT_IMAGE_PER_PROMPT = 4
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
"USER: <image>\nWhat's the content of the image?\nASSISTANT:", "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
@@ -52,6 +55,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
return hf_output_ids, hf_output_str, out_logprobs return hf_output_ids, hf_output_str, out_logprobs
@overload
def run_test( def run_test(
hf_runner: Type[HfRunner], hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: Type[VllmRunner],
@@ -64,6 +68,78 @@ def run_test(
num_logprobs: int, num_logprobs: int,
tensor_parallel_size: int, tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None, distributed_executor_backend: Optional[str] = None,
):
...
@overload
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
sizes: List[Tuple[int, int]],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
...
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: Optional[List[float]] = None,
sizes: Optional[List[Tuple[int, int]]] = None,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
images = [asset.pil_image for asset in image_assets]
if size_factors is not None:
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
elif sizes is not None:
inputs_per_image = [(
[prompt for _ in sizes],
[image.resize(size) for size in sizes],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
else:
raise ValueError("You must provide either `size_factors` or `sizes`")
_run_test(hf_runner,
vllm_runner,
inputs_per_image,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend)
def _run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
inputs: List[Tuple[List[str], PromptImageInput]],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
): ):
"""Inference result should be the same between hf and vllm. """Inference result should be the same between hf and vllm.
@@ -85,13 +161,6 @@ def run_test(
    else:
        mantis_processor = None
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
@@ -100,15 +169,18 @@ def run_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     dtype=dtype,
                     max_model_len=4096,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True,
                     limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
                                          }) as vllm_model:
        vllm_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
            for prompts, images in inputs
        ]
if mantis_processor is not None: if mantis_processor is not None:
@@ -131,7 +203,7 @@ def run_test(
max_tokens, max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
images=images) images=images)
for prompts, images in inputs_per_image for prompts, images in inputs
] ]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -181,6 +253,51 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
) )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
model, dtype, max_tokens,
num_logprobs) -> None:
stop_sign = image_assets[0].pil_image
cherry_blossom = image_assets[1].pil_image
inputs = [(
[
"USER: <image><image>\nDescribe 2 images.\nASSISTANT:",
"USER: <image><image>\nDescribe 2 images.\nASSISTANT:",
"USER: <image><image><image><image>\nDescribe 4 images.\nASSISTANT:", # noqa: E501
"USER: <image>\nWhat is the season?\nASSISTANT:",
],
[
[stop_sign, cherry_blossom],
# Images with different sizes and aspect-ratios
[
rescale_image_size(stop_sign, 0.1),
stop_sign,
],
[
stop_sign,
rescale_image_size(stop_sign, 0.25),
cherry_blossom.resize((183, 488)),
cherry_blossom.resize((488, 183))
],
cherry_blossom,
])]
_run_test(
hf_runner,
vllm_runner,
inputs,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
def test_context_length_too_short(vllm_runner, image_assets, model): def test_context_length_too_short(vllm_runner, image_assets, model):
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]

View File

@@ -0,0 +1,236 @@
from typing import List, Optional, Tuple, Type, overload
import pytest
import transformers
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.sequence import SampleLogprobs
from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
_PREFACE = (
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's "
"questions.")
HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"sample_demo_1":
f"{_PREFACE}USER: <video>\nWhy is this video funny? ASSISTANT:"
})
models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]],
model: str):
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
config = AutoConfig.from_pretrained(model)
video_token_id = config.video_token_index
tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
]
assert output_str[0] == " "
hf_output_str = output_str[1:]
if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
return hf_output_ids, hf_output_str, out_logprobs
@overload
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
video_assets: _VideoAssets,
model: str,
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
num_frames: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
...
@overload
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
video_assets: _VideoAssets,
model: str,
*,
sizes: List[Tuple[int, int]],
dtype: str,
max_tokens: int,
num_logprobs: int,
num_frames: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
...
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
video_assets: _VideoAssets,
model: str,
*,
size_factors: Optional[List[float]] = None,
sizes: Optional[List[Tuple[int, int]]] = None,
dtype: str,
max_tokens: int,
num_logprobs: int,
num_frames: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
videos = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
for video in videos:
print(video.shape)
if size_factors is not None:
inputs_per_video = [(
[prompt for _ in size_factors],
[rescale_video_size(video, factor) for factor in size_factors],
) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
elif sizes is not None:
inputs_per_video = [(
[prompt for _ in sizes],
[resize_video(video, size) for size in sizes],
) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
else:
raise ValueError("You must provide either `size_factors` or `sizes`")
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
dtype=dtype,
max_model_len=4096,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_video = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
videos=videos)
for prompts, videos in inputs_per_video
]
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model:
hf_outputs_per_video = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
videos=videos)
for prompts, videos in inputs_per_video
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
vllm_outputs_per_video):
# TODO: Check whether using original CLIPVisionModel can improve
# consistency against HF
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(vllm_output, model)
for vllm_output in vllm_outputs
],
name_0="hf",
name_1="vllm",
)
@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No video
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
dtype, max_tokens, num_logprobs, num_frames) -> None:
"""Inference result should be the same between hf and vllm.
All the video fixtures for the test are under tests/videos.
For huggingface runner, we provide the np.ndarray as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
run_test(
hf_runner,
vllm_runner,
video_assets,
model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
num_frames=num_frames,
tensor_parallel_size=1,
)
@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"sizes",
[[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
dtype, max_tokens, num_logprobs,
num_frames) -> None:
run_test(
hf_runner,
vllm_runner,
video_assets,
model,
sizes=sizes,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
num_frames=num_frames,
tensor_parallel_size=1,
)

View File

@@ -41,3 +41,43 @@ def test_models(
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize("model", MODELS[1:])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
) as mistral_format_model:
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=hf_format_outputs,
outputs_1_lst=mistral_format_outputs,
name_0="hf",
name_1="mistral",
)
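(Aside, not part of the diff: the test above contrasts loading the same checkpoint through the HF-style format and through Mistral's native format. A hedged offline sketch of the latter, with the argument names taken from the vllm_runner call above; the checkpoint name is an assumed example:)

from vllm import LLM

llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.3",  # assumed example checkpoint
    tokenizer_mode="mistral",
    load_format="mistral",
    config_format="mistral",
)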

View File

@@ -0,0 +1,79 @@
# flake8: noqa
"""Tests Model Optimizer fp8 models against ground truth generation
Note: these tests will only pass on H100
"""
import os
from typing import List
import pytest
from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024
MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.1-8B-Instruct-FP8": [
"You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
'**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
]
}
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp8 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
model = LLM(
model=model_name,
max_model_len=MAX_MODEL_LEN,
trust_remote_code=True,
enforce_eager=True,
quantization="modelopt",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
formatted_prompts = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
tokenize=False,
add_generation_prompt=True)
for prompt in example_prompts
]
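# apply_chat_template(..., tokenize=False, add_generation_prompt=True) returns
# the fully formatted chat prompt as a string (ending with the assistant
# generation header) rather than token ids, so it can be passed to vLLM as
# plain text.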
params = SamplingParams(max_tokens=20, temperature=0)
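# temperature=0 selects greedy decoding, so the generations are deterministic
# and can be compared against the golden strings above.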
generations: List[str] = []
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:
outputs = model.generate(prompt, params)
generations.append(outputs[0].outputs[0].text)
del model
print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name]
for i in range(len(example_prompts)):
generated_str = generations[i]
expected_str = expected_strs[i]
assert expected_str == generated_str, (
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")

View File

@@ -1,16 +1,15 @@
import os
import re
-from typing import List, Optional, Tuple, Type, Union
from typing import List, Optional, Tuple, Type
import pytest
-from PIL import Image
from transformers import AutoTokenizer
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu, is_hip
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
@@ -60,8 +59,7 @@ if is_hip():
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
-inputs: List[Tuple[List[str], Union[List[Image.Image],
-List[List[Image.Image]]]]],
inputs: List[Tuple[List[str], PromptImageInput]],
model: str,
*,
dtype: str,

View File

@@ -0,0 +1,64 @@
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py`.
"""
import pytest
from vllm.sampling_params import SamplingParams
pytestmark = pytest.mark.vlm
MODELS = ["mistralai/Pixtral-12B-2409"]
@pytest.mark.skip(
reason=
"Model is too big, test passed on A100 locally but will OOM on CI machine."
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
image_urls = [
"https://picsum.photos/id/237/200/300",
"https://picsum.photos/seed/picsum/200/300"
]
expected = [
"The image depicts a black dog lying on a wooden surface, looking directly at the camera with a calm expression.", # noqa
"The image depicts a serene landscape with a snow-covered mountain under a pastel-colored sky during sunset." # noqa
]
prompt = "Describe the image in one short sentence."
sampling_params = SamplingParams(max_tokens=512, temperature=0.0)
with vllm_runner(model, dtype=dtype,
tokenizer_mode="mistral") as vllm_model:
for i, image_url in enumerate(image_urls):
messages = [
{
"role":
"user",
"content": [{
"type": "text",
"text": prompt
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}]
},
]
outputs = vllm_model.model.chat(messages,
sampling_params=sampling_params)
assert outputs[0].outputs[0].text == expected[i]

View File

@@ -1,19 +1,154 @@
-from typing import Type
import pathlib
from typing import List, Optional, Type
import pytest
-from ..conftest import HfRunner, VllmRunner
from vllm.multimodal.utils import rescale_image_size
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close
-models = ["qwen/qwen-vl"]
pytestmark = pytest.mark.vlm
text_only_models = [
"Qwen/Qwen-7B-Chat" # Has no visual component
]
multimodal_models = ["Qwen/Qwen-VL"]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"Picture 1: <img></img>\nWhat's the content of the image?: ",
"cherry_blossom":
"Picture 1: <img></img>\nWhat is the season?: ",
})
@pytest.mark.parametrize("dtype", ["half"]) ### Tests for multimodal Qwen models
def run_test(
tmp_path: pathlib.PosixPath,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
images = [asset.pil_image for asset in image_assets]
# Export the images to a tempdir and substitute it into the hf prompt;
# the contents between <img>/</img> will be ignored by VLLM, but the
# transformers implementation for the visual transformer parses this to
# reload it in the forward call; the contents are treated as a URL or a
# local path.
for idx, asset in enumerate(image_assets):
image_tmp_path = tmp_path / f"{asset.name}.jpg"
asset.pil_image.save(image_tmp_path)
HF_IMAGE_PROMPTS[idx] = HF_IMAGE_PROMPTS[idx].replace(
"<img></img>", f"<img>{image_tmp_path}</img>")
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
# Qwen encodes images into a fixed content size of 256
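# (256 image tokens plus the short text prompt and the few output tokens must
# fit, so max_model_len=300 leaves just enough headroom for these prompts.)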
with vllm_runner(model,
max_model_len=300,
max_num_seqs=1,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
vllm_outputs_per_image):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models(tmp_path, hf_runner, vllm_runner, image_assets,
model, size_factors, dtype, max_tokens,
num_logprobs) -> None:
run_test(
tmp_path,
hf_runner,
vllm_runner,
image_assets,
model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
# Ensure that a text-only Qwen model can still be loaded and
# used for inference in VLLM without throwing.
@pytest.mark.parametrize("model", text_only_models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("model", models) def test_text_only_qwen_model_can_be_loaded_and_run(
def test_text_only_qwen_model(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: Type[VllmRunner],
example_prompts, example_prompts,
model: str, model: str,
@@ -22,27 +157,9 @@ def test_text_only_qwen_model(
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
): ):
-# This test checks language inputs only, since the visual component
-# for qwen-vl is still unsupported in VLLM. In the near-future, the
-# implementation and this test will be extended to consider
-# visual inputs as well.
-with hf_runner(model, dtype=dtype) as hf_model:
-hf_outputs = hf_model.generate_greedy_logprobs_limit(
-example_prompts,
-max_tokens,
-num_logprobs=num_logprobs,
-)
with vllm_runner(model, dtype=dtype) as vllm_model:
-vllm_outputs = vllm_model.generate_greedy_logprobs(
vllm_model.generate_greedy_logprobs(
example_prompts,
max_tokens,
num_logprobs=num_logprobs,
)
-check_logprobs_close(
-outputs_0_lst=hf_outputs,
-outputs_1_lst=vllm_outputs,
-name_0="hf",
-name_1="vllm",
-)

View File

@@ -1,9 +1,14 @@
import pytest
import transformers
from vllm.model_executor.models import _MODELS, ModelRegistry
@pytest.mark.parametrize("model_cls", _MODELS)
def test_registry_imports(model_cls):
if (model_cls == "Qwen2VLForConditionalGeneration"
and transformers.__version__ < "4.45"):
pytest.skip("Waiting for next transformers release")
# Ensure all model classes can be imported successfully
ModelRegistry.resolve_model_cls([model_cls])

View File

@@ -57,7 +57,7 @@ def test_multi_step_llm(
GPU -> CPU output transfer
num_prompts: number of example prompts under test
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
-completions endpoint; `None` -> no logprobs
completions endpoint; `None` -> 1 logprob returned.
"""
prompts = example_prompts

View File

@@ -56,7 +56,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
assert qkv_proj.weight_scale.dtype is torch.float32
assert qkv_proj.input_scale.dtype is torch.float32
-output = llm.generate_greedy("Hello my name is", max_tokens=20)
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
assert output
@@ -85,7 +85,7 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
assert qkv_proj.scheme.strategy == strategy
assert qkv_proj.weight.dtype is torch.int8
-output = llm.generate_greedy("Hello my name is", max_tokens=20)
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
assert output

View File

@@ -1,224 +1,54 @@
-import asyncio
-import os
from itertools import cycle
-from typing import Dict, List, Optional, Sequence, Tuple, Union
from typing import List, Optional, Tuple
import pytest
-import ray
-import torch
-from vllm import LLM
from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.lora.request import LoRARequest
from vllm.model_executor.utils import set_random_seed
-from vllm.multimodal import MultiModalDataDict
-from vllm.outputs import RequestOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import Logprob
-from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Counter, random_uuid
from ...conftest import cleanup
-from ...utils import wait_for_gpu_memory_to_clear
from ...models.utils import check_logprobs_close, check_outputs_equal
from ...utils import RemoteOpenAIServer
-class AsyncLLM:
-"""AsyncLLM
-Note: Current LLM class in vllm don't support async mode, for test purpose,
-we implement async one in here. Maybe we could move to
-vllm/entrypoints/llm.py in future.
-Below AsyncLLM is directly borrow from vllm/entrypoints/llm.py with changes
-to make to work in async mode.
-"""
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
"San Francisco is know for its",
"Facebook was created in 2004 by",
"Curious George is a",
"Python 3.11 brings improvements to its",
]
def __init__(
self,
model: str,
tokenizer: Optional[str] = None,
tokenizer_mode: str = "auto",
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: int = 4,
enforce_eager: bool = False,
max_seq_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
**kwargs,
) -> None:
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
# Needed to engine_use_ray works as a deprecated feature,
# otherwise the following constructor will raise an exception
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
engine_args = AsyncEngineArgs(
model=model,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
enforce_eager=enforce_eager,
max_seq_len_to_capture=max_seq_len_to_capture,
# For now use ray for the distributed back-end, since
# we rely on the use of engine_use_ray=True to avoid
# reinitializing CUDA in the same process (driver worker)
engine_use_ray=True,
distributed_executor_backend="ray",
disable_custom_all_reduce=disable_custom_all_reduce,
**kwargs,
)
self.request_counter = Counter()
self.llm_engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.LLM_CLASS)
def generate(
self,
prompts: Optional[Union[str, List[str]]] = None,
sampling_params: Optional[Union[SamplingParams,
List[SamplingParams]]] = None,
prompt_token_ids: Optional[List[List[int]]] = None,
use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalDataDict] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None
) -> List[RequestOutput]:
if prompts is None:
raise ValueError("prompts must be provided.")
if isinstance(prompts, str):
# Convert a single prompt to a list.
prompts = [prompts]
if prompts is not None:
num_requests = len(prompts)
if sampling_params is None:
# Use default sampling params.
sampling_params = SamplingParams()
elif isinstance(sampling_params,
list) and len(sampling_params) != num_requests:
raise ValueError("The lengths of prompts and "
"sampling_params must be the same.")
async def get_output(prompt, sampling_param) -> RequestOutput:
request_id = random_uuid()
results_generator = self.llm_engine.generate(
prompt, sampling_param, request_id)
final_output = None
async for request_output in results_generator:
final_output = request_output
assert final_output is not None
return final_output
outputs: List[RequestOutput] = []
try:
for i in range(num_requests):
prompt = prompts[i] if prompts is not None else None
params = sampling_params[i] if isinstance(
sampling_params, Sequence) else sampling_params
res = asyncio.run(get_output(prompt, params))
outputs.append(res)
finally:
ray.shutdown()
return outputs
@pytest.fixture @pytest.fixture
def baseline_llm_generator(request, common_llm_kwargs, def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
seed):
return create_llm_generator("baseline", request, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, seed)
@pytest.fixture
def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs,
test_llm_kwargs, seed): test_llm_kwargs, seed):
return create_llm_generator("test", request, common_llm_kwargs,
per_test_common_llm_kwargs, test_llm_kwargs,
seed)
def generate():
kwargs = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**test_llm_kwargs,
}
def create_llm_generator(baseline_or_test, request, common_llm_kwargs, llm = LLM(**kwargs)
per_test_common_llm_kwargs, distinct_llm_kwargs,
seed):
kwargs = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**distinct_llm_kwargs,
}
test_name = request.node.name
model = kwargs["model"]
draft_model = kwargs.get("speculative_model", None)
same_draft_target_model = (draft_model is not None
and draft_model == model)
def generator_inner():
wait_for_gpu_memory_to_clear(
devices=list(range(torch.cuda.device_count())),
threshold_bytes=2 * 2**30,
timeout_s=60,
)
use_async = False
if "use_async" in kwargs:
use_async = kwargs.pop("use_async")
print(f'{use_async=}')
print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
llm = AsyncLLM(**kwargs) if use_async else LLM(**kwargs)
# Override logging interval to 0 for spec decode test run to
# log all metrics in time.
if (baseline_or_test == "test" and not use_async
and llm.llm_engine.log_stats):
for sate_logger in llm.llm_engine.stat_loggers.values():
sate_logger.local_interval = 0
if seed is not None: if seed is not None:
set_random_seed(seed) set_random_seed(seed)
yield llm yield llm
del llm del llm
cleanup() cleanup()
def generator_outer(): return generate
for llm in generator_inner():
yield llm
del llm
# Set an attribute to the generator_outer function to allow us to
# determine whether to further check the acceptance rate in tests.
generator_outer.same_draft_target_model = same_draft_target_model # type: ignore
return generator_outer
def maybe_assert_ngram_worker(llm):
# Verify the proposer worker is ngram if ngram is specified.
-if (not isinstance(llm, AsyncLLM)
-and llm.llm_engine.speculative_config is not None
if (llm.llm_engine.speculative_config is not None
and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
from vllm.spec_decode.ngram_worker import NGramWorker
assert isinstance(
@@ -251,118 +81,165 @@ def get_output_from_llm_generator(
return tokens, token_ids, acceptance_rate
def get_logprobs_from_llm_generator( def run_logprob_correctness_test(vllm_runner,
llm_generator, prompts, common_llm_kwargs,
sampling_params) -> List[List[Dict[int, Logprob]]]: per_test_common_llm_kwargs,
"""Returns a dict of (token_id: Logprob) for each generated position, for baseline_llm_kwargs,
each sequence in the batch. test_llm_kwargs,
""" batch_size: int,
for llm in llm_generator(): max_output_len: int,
outputs = llm.generate(prompts, sampling_params, use_tqdm=True) seed: Optional[int] = 0,
logprobs = [output.outputs[0].logprobs[:] for output in outputs] temperature: float = 0.0,
del llm logprobs: int = 1):
org_args = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**baseline_llm_kwargs,
}
return logprobs sd_args = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**test_llm_kwargs,
}
prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
sampling_params = SamplingParams(temperature=temperature,
max_tokens=max_output_len,
seed=seed,
logprobs=logprobs)
with vllm_runner(**org_args) as vllm_model:
org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
with vllm_runner(**sd_args) as vllm_model:
sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
check_logprobs_close(outputs_0_lst=org_outputs,
outputs_1_lst=sd_outputs,
name_0="org",
name_1="sd")
def run_greedy_equality_correctness_test(baseline_llm_generator, def run_equality_correctness_test(
test_llm_generator, vllm_runner,
batch_size, common_llm_kwargs,
max_output_len, per_test_common_llm_kwargs,
force_output_len: bool, baseline_llm_kwargs,
print_tokens: bool = False, test_llm_kwargs,
ensure_all_accepted: bool = False): batch_size: int,
max_output_len: int,
seed: Optional[int] = 0,
temperature: float = 0.0,
disable_seed: bool = False,
ignore_eos: bool = True,
ensure_all_accepted: bool = False,
expected_acceptance_rate: Optional[float] = None):
org_args = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**baseline_llm_kwargs,
}
sd_args = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**test_llm_kwargs,
}
prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
if disable_seed:
seed = None
sampling_params = SamplingParams(temperature=temperature,
max_tokens=max_output_len,
seed=seed,
ignore_eos=ignore_eos)
with vllm_runner(**org_args) as vllm_model:
org_outputs = vllm_model.generate(prompts, sampling_params)
with vllm_runner(**sd_args) as vllm_model:
if ensure_all_accepted or expected_acceptance_rate is not None:
# Force log interval to be 0 to catch all metrics.
stat_logger = vllm_model.model.llm_engine.stat_loggers[
'prometheus']
stat_logger.local_interval = -100
sd_outputs = vllm_model.generate(prompts, sampling_params)
if ensure_all_accepted or expected_acceptance_rate is not None:
acceptance_rate = (stat_logger.metrics.
gauge_spec_decode_draft_acceptance_rate.labels(
**stat_logger.labels)._value.get())
if ensure_all_accepted:
assert True
# FIXME: ci fails to log acceptance rate.
# It works locally.
# assert acceptance_rate == 1.0
if expected_acceptance_rate is not None:
assert acceptance_rate >= expected_acceptance_rate - 1e-2
check_outputs_equal(outputs_0_lst=org_outputs,
outputs_1_lst=sd_outputs,
name_0="org",
name_1="sd")
def run_equality_correctness_test_tp(model,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size: int,
max_output_len: int,
seed: int = 0,
temperature: float = 0.0):
"""Helper method that compares the outputs of both the baseline LLM and """Helper method that compares the outputs of both the baseline LLM and
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the same when temperature is zero. the same when temperature is zero.
""" """
arg1 = common_llm_kwargs + per_test_common_llm_kwargs + baseline_llm_kwargs
arg2 = common_llm_kwargs + per_test_common_llm_kwargs + test_llm_kwargs
env1 = env2 = None
run_equality_correctness_test(baseline_llm_generator, max_wait_seconds = 240
test_llm_generator, results = []
batch_size,
max_output_len,
force_output_len,
temperature=0.0,
seeded=False,
print_tokens=print_tokens,
ensure_all_accepted=ensure_all_accepted)
prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
def run_equality_correctness_test( for args, env in ((arg1, env1), (arg2, env2)):
baseline_llm_generator, with RemoteOpenAIServer(model,
test_llm_generator, args,
batch_size, env_dict=env,
max_output_len, max_wait_seconds=max_wait_seconds) as server:
force_output_len: bool, client = server.get_client()
temperature: float,
seeded: bool,
print_tokens: bool = False,
ensure_all_accepted: bool = False,
expected_acceptance_rate: Optional[float] = None):
"""Helper method that compares the outputs of both the baseline LLM and
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the same when temperature is zero (or when temperature is > 0 and seeded).
"""
prompts = [ completion = client.completions.create(model=model,
"Hello, my name is", prompt=prompts,
"The president of the United States is", max_tokens=max_output_len,
"The capital of France is", seed=seed,
"The future of AI is", temperature=temperature)
"San Francisco is know for its",
"Facebook was created in 2004 by",
"Curious George is a",
"Python 3.11 brings improvements to its",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] results.append({
"test":
"seeded_sampling",
"text": [choice.text for choice in completion.choices],
"finish_reason":
[choice.finish_reason for choice in completion.choices],
"usage":
completion.usage,
})
# If the test requires that we generated max_output_len tokens, then set the n = len(results) // 2
# sampling params to ignore eos token. arg1_results = results[:n]
ignore_eos = force_output_len arg2_results = results[n:]
for arg1_result, arg2_result in zip(arg1_results, arg2_results):
if seeded: assert arg1_result == arg2_result, (
sampling_params = [ f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
SamplingParams( f"{arg1_result=} != {arg2_result=}")
max_tokens=max_output_len,
ignore_eos=ignore_eos,
temperature=temperature,
seed=i,
) for i in range(len(prompts))
]
else:
sampling_params = SamplingParams(
max_tokens=max_output_len,
ignore_eos=ignore_eos,
temperature=temperature,
)
(spec_batch_tokens, spec_batch_token_ids,
acceptance_rate) = get_output_from_llm_generator(test_llm_generator,
prompts, sampling_params)
(baseline_batch_tokens, baseline_batch_token_ids,
_) = get_output_from_llm_generator(baseline_llm_generator, prompts,
sampling_params)
assert len(baseline_batch_token_ids) == len(prompts)
assert len(spec_batch_token_ids) == len(prompts)
for i, (baseline_token_ids, baseline_tokens, spec_token_ids,
spec_tokens) in enumerate(
zip(baseline_batch_token_ids, baseline_batch_tokens,
spec_batch_token_ids, spec_batch_tokens)):
if print_tokens:
print(f'{i=} {baseline_tokens=}')
print(f'{i=} {spec_tokens=}')
print(f'{i=} {baseline_token_ids=}')
print(f'{i=} {spec_token_ids=}')
assert baseline_token_ids == spec_token_ids
print(f'{acceptance_rate=}')
if ensure_all_accepted:
assert acceptance_rate == 1.0
if expected_acceptance_rate is not None:
assert acceptance_rate >= expected_acceptance_rate - 1e-2

View File

@@ -21,7 +21,7 @@ correctess for the target model outputs.
import pytest
-from .conftest import run_greedy_equality_correctness_test
from .conftest import run_equality_correctness_test
# main model
MAIN_MODEL = "JackFram/llama-68m"
@@ -53,7 +53,7 @@ PRECISION = "float32"
"dtype": PRECISION,
# Main model
-"model": MAIN_MODEL,
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -68,15 +68,16 @@ PRECISION = "float32"
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness(baseline_llm_generator, def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
test_llm_generator, batch_size: int, per_test_common_llm_kwargs,
output_len: int): baseline_llm_kwargs, test_llm_kwargs,
"""Verify greedy equality with different batch size.""" batch_size: int, output_len: int,
run_greedy_equality_correctness_test(baseline_llm_generator, seed: int):
test_llm_generator,
batch_size, run_equality_correctness_test(vllm_runner, common_llm_kwargs,
max_output_len=output_len, per_test_common_llm_kwargs,
force_output_len=True) baseline_llm_kwargs, test_llm_kwargs,
batch_size, output_len, seed)
@pytest.mark.parametrize(
@@ -94,7 +95,7 @@ def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
"dtype": PRECISION,
# Main model
-"model": MAIN_MODEL,
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -109,17 +110,16 @@ def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator, def test_eagle_e2e_greedy_correctness_cuda_graph(
test_llm_generator, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
output_len: int): seed: int):
"""Verify greedy equality with cuda graph enabled and different """Verify greedy equality with cuda graph enabled and different
batch sizes.""" batch sizes."""
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner, common_llm_kwargs,
test_llm_generator, per_test_common_llm_kwargs,
batch_size, baseline_llm_kwargs, test_llm_kwargs,
max_output_len=output_len, batch_size, output_len, seed)
force_output_len=True)
@pytest.mark.parametrize(
@@ -140,7 +140,7 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
"dtype": PRECISION,
# Main model
-"model": MAIN_MODEL,
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -158,18 +158,17 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator, def test_eagle_e2e_greedy_correctness_with_preemption(
test_llm_generator, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
output_len: int): seed: int):
"""Verify greedy equality, even when some sequences are preempted mid- """Verify greedy equality, even when some sequences are preempted mid-
generation. generation.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner, common_llm_kwargs,
test_llm_generator, per_test_common_llm_kwargs,
batch_size, baseline_llm_kwargs, test_llm_kwargs,
max_output_len=output_len, batch_size, output_len, seed)
force_output_len=True)
@pytest.mark.parametrize(
@@ -185,7 +184,7 @@ def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
"dtype": PRECISION,
# Main model
-"model": MAIN_MODEL,
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -207,16 +206,17 @@ def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
32,
])
@pytest.mark.parametrize("seed", [1])
def test_eagle_different_k(baseline_llm_generator, test_llm_generator, def test_eagle_different_k(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int): per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify that eagle speculative decoding produces exact equality """Verify that eagle speculative decoding produces exact equality
to without spec decode with different values of num_speculative_tokens. to without spec decode with different values of num_speculative_tokens.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner, common_llm_kwargs,
test_llm_generator, per_test_common_llm_kwargs,
batch_size, baseline_llm_kwargs, test_llm_kwargs,
max_output_len=output_len, batch_size, output_len, seed)
force_output_len=True)
@pytest.mark.parametrize(
@@ -232,7 +232,7 @@ def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
"dtype": PRECISION,
# Main model
-"model": MAIN_MODEL,
"model_name": MAIN_MODEL,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -250,17 +250,18 @@ def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
32,
])
@pytest.mark.parametrize("seed", [1])
def test_eagle_disable_queue(baseline_llm_generator, test_llm_generator, def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int): per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify that eagle speculative decoding produces exact equality """Verify that eagle speculative decoding produces exact equality
to without spec decode when speculation is disabled for large to without spec decode when speculation is disabled for large
batch sizes. batch sizes.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner, common_llm_kwargs,
test_llm_generator, per_test_common_llm_kwargs,
batch_size, baseline_llm_kwargs, test_llm_kwargs,
max_output_len=output_len, batch_size, output_len, seed)
force_output_len=True)
if __name__ == "__main__":

View File

@@ -4,7 +4,9 @@ other features, e.g. cuda graphs.
import pytest
-from .conftest import run_greedy_equality_correctness_test
from .conftest import run_equality_correctness_test
MAIN_MODEL = "JackFram/llama-68m"
@pytest.mark.parametrize(
@@ -15,7 +17,7 @@ from .conftest import run_greedy_equality_correctness_test
# Verify equality when cuda graphs allowed.
"enforce_eager": False,
-"model": "JackFram/llama-68m",
"model_name": "JackFram/llama-68m",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
@@ -31,23 +33,27 @@ from .conftest import run_greedy_equality_correctness_test
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [32]) @pytest.mark.parametrize("output_len", [32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator, def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
batch_size, output_len): per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int):
"""Verify spec decode equality when cuda graphs are enabled. """Verify spec decode equality when cuda graphs are enabled.
""" """
run_greedy_equality_correctness_test( run_equality_correctness_test(vllm_runner,
baseline_llm_generator, common_llm_kwargs,
test_llm_generator, per_test_common_llm_kwargs,
batch_size, baseline_llm_kwargs,
max_output_len=output_len, test_llm_kwargs,
force_output_len=True, batch_size,
) max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
-"model": "JackFram/llama-160m",
"model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
@@ -80,13 +86,19 @@ def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_speculative_model_quantization_config(baseline_llm_generator, def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
test_llm_generator, per_test_common_llm_kwargs,
batch_size: int): baseline_llm_kwargs,
test_llm_kwargs,
batch_size: int, seed: int):
"""Verify spec decode works well with draft model quantization configs. """Verify spec decode works well with draft model quantization configs.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=32, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0)

View File

@@ -7,42 +7,39 @@ import torch
from vllm.utils import is_hip
-from .conftest import run_greedy_equality_correctness_test
from .conftest import run_equality_correctness_test_tp
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{ [[
"model": "JackFram/llama-68m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "--enforce-eager",
# Required for spec decode. # Required for spec decode.
"use_v2_block_manager": True, "--use-v2-block-manager",
"tensor_parallel_size": 2, "--tensor-parallel-size",
"2"
# Use AsyncLLM engine, so that the engine runs in its own process. ]])
# Otherwise, since vLLM does not follow true SPMD, the test runner @pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
# process will have both the engine and the rank0 worker. NCCL is not @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
# cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error.
"use_async": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ [
"speculative_model": "JackFram/llama-68m", "--speculative-model",
"num_speculative_tokens": 3, "JackFram/llama-68m",
}, "--num-speculative-tokens",
{ "3",
"speculative_model": "[ngram]", ],
"num_speculative_tokens": 5, [
"ngram_prompt_lookup_max": 3, "--speculative-model",
}, "[ngram]",
"--num-speculative-tokens",
"5",
"--ngram-prompt-lookup-max",
"3",
],
])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
@@ -52,75 +49,75 @@ from .conftest import run_greedy_equality_correctness_test
32,
])
@pytest.mark.parametrize("seed", [1])
def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, output_len: int): baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int):
"""Verify greedy equality when tensor parallelism is used. """Verify greedy equality when tensor parallelism is used.
""" """
if is_hip(): if is_hip():
pytest.skip("hip is not well-supported yet") pytest.skip("hip is not well-supported yet")
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test_tp("JackFram/llama-68m",
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{ [[
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "--enforce-eager",
# Required for spec decode. # Required for spec decode.
"use_v2_block_manager": True, "--use_v2_block_manager",
"tensor_parallel_size": 2, "--tensor_parallel_size",
"2",
# Use AsyncLLM engine, so that the engine runs in its own process.
# Otherwise, since vLLM does not follow true SPMD, the test runner
# process will have both the engine and the rank0 worker. NCCL is not
# cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error.
"use_async": True,
# precision # precision
"dtype": "float32", "--dtype",
}]) "bfloat16",
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) ]])
@pytest.mark.parametrize( @pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
"per_test_common_llm_kwargs, test_llm_kwargs", @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
[ @pytest.mark.parametrize("model, test_llm_kwargs",
( [("JackFram/llama-68m", [
{ "--speculative-model",
# Use a small model for a fast test. "JackFram/llama-68m",
# Note this is repeated in the test body; to initialize a "--num_speculative-tokens",
# tokenizer. "5",
"model": "JackFram/llama-68m", "--speculative-draft-tensor-parallel-size",
}, "1",
{ ]),
"speculative_model": "JackFram/llama-68m", ("ibm-granite/granite-3b-code-instruct", [
"num_speculative_tokens": 5, "--speculative-model",
"speculative_draft_tensor_parallel_size": 1, "ibm-granite/granite-3b-code-instruct",
}), "--num_speculative-tokens",
({ "5",
"model": "ibm-granite/granite-3b-code-instruct", "--speculative-draft-tensor-parallel-size",
}, { "1",
"speculative_model": ])])
"ibm-granite/granite-3b-code-instruct-accelerator",
"num_speculative_tokens": 5,
"speculative_draft_tensor_parallel_size": 1,
})
])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
baseline_llm_generator, per_test_common_llm_kwargs,
batch_size: int): baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
seed: int):
"""Verify spec decode works well with smaller tp for draft models. """Verify spec decode works well with smaller tp for draft models.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test_tp(model,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=32, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0)

View File

@@ -2,98 +2,97 @@
tensor parallelism.
"""
import openai
import pytest
import torch
-from .conftest import run_greedy_equality_correctness_test
from .conftest import run_equality_correctness_test_tp
MAIN_MODEL = "JackFram/llama-68m"
SPEC_MODEL = "JackFram/llama-68m"
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{ [[
# Use a small model for a fast test.
# Note this is repeated in the test body; to initialize a tokenizer.
"model": "JackFram/llama-68m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "--enforce_eager",
# Required for spec decode. # Required for spec decode.
"use_v2_block_manager": True, "--use-v2-block-manager",
"tensor_parallel_size": 4, "--tensor-parallel-size",
"4",
# Use AsyncLLM engine, so that the engine runs in its own process. ]])
# Otherwise, since vLLM does not follow true SPMD, the test runner
# process will have both the engine and the rank0 worker. NCCL is not
# cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error.
"use_async": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ [
"speculative_model": "JackFram/llama-68m", "--speculative-model",
"num_speculative_tokens": 5, f"{SPEC_MODEL}",
}, "--num-speculative-tokens",
"5",
],
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_llm_kwargs", "test_llm_kwargs",
[ [
#TODO(wooyeon): add spec_draft_dp=2 case #TODO(wooyeon): add spec_draft_dp=2 case
{ [
"speculative_draft_tensor_parallel_size": 1, "--speculative-draft-tensor-parallel-size",
}, "1",
],
]) ])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
baseline_llm_generator, per_test_common_llm_kwargs,
batch_size: int): baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
seed: int):
"""Verify spec decode works well with smaller tp for draft models. """Verify spec decode works well with smaller tp for draft models.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test_tp(MAIN_MODEL,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=32, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{ [[
"model": "JackFram/llama-160m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "--enforce-eager",
# Required for spec decode. # Required for spec decode.
"use_v2_block_manager": True, "--use-v2-block-manager",
"tensor_parallel_size": 4, "--tensor-parallel-size",
"4",
# Use AsyncLLM engine, so that the engine runs in its own process. ]])
# Otherwise, since vLLM does not follow true SPMD, the test runner @pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
# process will have both the engine and the rank0 worker. NCCL is not @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
# cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error.
"use_async": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ [
"speculative_model": "JackFram/llama-68m", "--speculative-model",
"num_speculative_tokens": 5, f"{SPEC_MODEL}",
"--num-speculative-tokens",
"5",
# Artificially limit the draft model max model len; this forces vLLM # Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens. # to skip speculation once the sequences grow beyond 32-k tokens.
"speculative_max_model_len": 32, "--speculative-max-model-len",
}, "32",
],
]) ])
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -105,8 +104,9 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
64, 64,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_skip_speculation(baseline_llm_generator, test_llm_generator, def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, output_len: int): baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int):
"""Verify job failure with RuntimeError when all sequences skip speculation. """Verify job failure with RuntimeError when all sequences skip speculation.
We do this by setting the max model len of the draft model to an We do this by setting the max model len of the draft model to an
artificially low value, such that when the sequences grow beyond it, they artificially low value, such that when the sequences grow beyond it, they
@@ -114,9 +114,13 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
TODO: fix it to pass without raising Error. (#5814) TODO: fix it to pass without raising Error. (#5814)
""" """
with pytest.raises(RuntimeError): with pytest.raises(openai.APIConnectionError):
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test_tp(MAIN_MODEL,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0)

View File

@@ -1,24 +1,22 @@
import math
from itertools import cycle
import pytest
from vllm import SamplingParams
-from .conftest import get_logprobs_from_llm_generator
from .conftest import run_logprob_correctness_test
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
-"model": "JackFram/llama-68m",
"model_name": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
"max_logprobs": 6,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -36,64 +34,29 @@ from .conftest import get_logprobs_from_llm_generator
7,
])
@pytest.mark.parametrize("seed", [1])
def test_logprobs_equality(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize("logprobs", [1, 6])
batch_size: int, output_len: int): def test_logprobs_equality(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int, logprobs: int):
"""Verify output logprobs are equal with and without speculative decoding. """Verify output logprobs are equal with and without speculative decoding.
""" """
run_greedy_logprobs_correctness_test(baseline_llm_generator, run_logprob_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
-"model": "JackFram/llama-68m",
"model_name": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
"max_logprobs": 6,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}])
@pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("num_logprobs", [6])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
7,
])
@pytest.mark.parametrize("seed", [1])
def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
batch_size: int, output_len: int,
num_logprobs: int):
"""Verify output logprobs are equal with and without spec decode.
This specifies a number of logprobs >1.
"""
run_greedy_logprobs_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len=output_len,
force_output_len=True,
logprob_rank=num_logprobs)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
@@ -121,21 +84,29 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
32,
])
@pytest.mark.parametrize("seed", [1])
def test_logprobs_different_k(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize("logprobs", [1, 6])
batch_size: int, output_len: int): def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
output_len: int, seed: int, logprobs: int):
"""Veriy logprob greedy equality with different speculation lens. """Veriy logprob greedy equality with different speculation lens.
""" """
run_greedy_logprobs_correctness_test(baseline_llm_generator, run_logprob_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
-"model": "JackFram/llama-68m",
"model_name": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
@@ -164,22 +135,30 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
32,
])
@pytest.mark.parametrize("seed", [1])
def test_logprobs_when_skip_speculation(baseline_llm_generator, @pytest.mark.parametrize("logprobs", [1])
test_llm_generator, batch_size: int, def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
output_len: int): per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int,
seed: int, logprobs: int):
"""Verify logprobs greedy equality when some sequences skip speculation. """Verify logprobs greedy equality when some sequences skip speculation.
""" """
run_greedy_logprobs_correctness_test(baseline_llm_generator, run_logprob_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0,
logprobs=logprobs)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
-"model": "JackFram/llama-68m",
"model_name": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
@@ -203,19 +182,17 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator,
32,
])
@pytest.mark.parametrize("seed", [1])
def test_logprobs_temp_1(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize("logprobs", [6])
batch_size: int, output_len: int): def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int, logprobs: int):
"""Verify at least one logprob result has num_logprobs+1, which tests the """Verify at least one logprob result has num_logprobs+1, which tests the
case where the sampled token is not in top-k logprobs. case where the sampled token is not in top-k logprobs.
Ideally, this test should validate equality with non-spec by getting Ideally, this test should validate equality with non-spec by getting
logprobs. This is left as future improvement. logprobs. This is left as future improvement.
""" """
batch_size = 8
max_output_len = output_len
force_output_len = True
logprob_rank = 5
temperature = 1.0 temperature = 1.0
prompts = [ prompts = [
@@ -231,129 +208,40 @@ def test_logprobs_temp_1(baseline_llm_generator, test_llm_generator,
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
# If the test requires that we generated max_output_len tokens, then set the
# sampling params to ignore eos token.
ignore_eos = force_output_len
sampling_params = SamplingParams( sampling_params = SamplingParams(
max_tokens=max_output_len, max_tokens=output_len,
ignore_eos=ignore_eos, ignore_eos=True,
temperature=temperature, temperature=temperature,
logprobs=logprob_rank, logprobs=logprobs,
) )
spec_batch_logprobs = get_logprobs_from_llm_generator( sd_args = {
test_llm_generator, prompts, sampling_params) **common_llm_kwargs,
**per_test_common_llm_kwargs,
**test_llm_kwargs,
}
with vllm_runner(**sd_args) as vllm_model:
sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
num_returned_logprobs = [ num_returned_logprobs = [
len(logprob_dict) for seq_logprobs in spec_batch_logprobs len(seq_logprobs) for seq_logprobs in sd_outputs[-1]
for logprob_dict in seq_logprobs
] ]
# Assert one of the returned logprobs has > num_logprobs (indicating the # Assert one of the returned logprobs has > num_logprobs (indicating the
# sampled token is not in top-k). # sampled token is not in top-k).
assert any([ assert any(
num_returned > logprob_rank for num_returned in num_returned_logprobs [num_returned > logprobs for num_returned in num_returned_logprobs])
])
def run_greedy_logprobs_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len,
force_output_len: bool,
logprob_rank: int = 1):
"""Helper method that compares the logprobs outputs of both the baseline LLM
and the test LLM. It asserts greedy equality of the logprobs when the
temperature is zero.
"""
temperature = 0.0
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
"San Francisco is know for its",
"Facebook was created in 2004 by",
"Curious George is a",
"Python 3.11 brings improvements to its",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
# If the test requires that we generated max_output_len tokens, then set the
# sampling params to ignore eos token.
ignore_eos = force_output_len
sampling_params = SamplingParams(
max_tokens=max_output_len,
ignore_eos=ignore_eos,
temperature=temperature,
logprobs=logprob_rank,
)
spec_batch_logprobs = get_logprobs_from_llm_generator(
test_llm_generator, prompts, sampling_params)
baseline_batch_logprobs = get_logprobs_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
assert len(baseline_batch_logprobs) == len(prompts)
assert len(spec_batch_logprobs) == len(prompts)
# For each sequence in the batch.
for i, (baseline_logprobs, spec_logprobs) in enumerate(
zip(baseline_batch_logprobs, spec_batch_logprobs)):
assert len(spec_logprobs) == len(baseline_logprobs)
# For each generated position of the sequence.
for pos, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
zip(spec_logprobs, baseline_logprobs)):
# Map rank to token/logprob in spec output.
spec_rank_to_token_id = {
value.rank: key
for key, value in spec_pos_logprobs.items()
}
spec_rank_to_logprob = {
value.rank: value.logprob
for key, value in spec_pos_logprobs.items()
}
# Map rank to token/logprob in baseline output.
baseline_rank_to_token_id = {
value.rank: key
for key, value in baseline_pos_logprobs.items()
}
baseline_rank_to_logprob = {
value.rank: value.logprob
for key, value in baseline_pos_logprobs.items()
}
# Assert set of ranks returned is equal.
assert set(spec_rank_to_token_id.keys()) == set(
baseline_rank_to_token_id.keys())
# Assert each logprob/token id is correct, keyed by rank.
for rank in sorted(set(spec_rank_to_token_id.keys())):
assert spec_rank_to_token_id[
rank] == baseline_rank_to_token_id[rank], f"{rank}"
assert math.isclose(
a=spec_rank_to_logprob[rank],
b=baseline_rank_to_logprob[rank],
abs_tol=1e-1,
)
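(The in-file helper removed above is superseded by a shared run_logprob_correctness_test in the spec-decode conftest. Its implementation is not part of this diff; the following is only a minimal, hypothetical sketch of how such a helper could compare baseline and spec-decode logprobs. The fixture names, the generate_w_logprobs call, and the assumed return shape are inferred from the call sites shown in this diff, not copied from the real conftest.)

import math
from itertools import cycle

from vllm import SamplingParams


def run_logprob_correctness_test(vllm_runner, common_llm_kwargs,
                                 per_test_common_llm_kwargs,
                                 baseline_llm_kwargs, test_llm_kwargs,
                                 batch_size, output_len, seed,
                                 temperature=0.0, logprobs=1):
    """Hypothetical sketch, not the actual conftest implementation: compare
    per-position logprobs of a baseline and a spec-decode configuration."""
    prompts = [
        prompt for prompt, _ in zip(cycle(["Hello, my name is"]),
                                    range(batch_size))
    ]
    sampling_params = SamplingParams(max_tokens=output_len,
                                     ignore_eos=True,
                                     temperature=temperature,
                                     logprobs=logprobs,
                                     seed=seed)

    def generate(extra_kwargs):
        kwargs = {
            **common_llm_kwargs,
            **per_test_common_llm_kwargs,
            **extra_kwargs,
        }
        with vllm_runner(**kwargs) as vllm_model:
            # Assumed: returns, per prompt, a list of per-position dicts
            # mapping token id -> Logprob(rank=..., logprob=...).
            return vllm_model.generate_w_logprobs(prompts, sampling_params)

    baseline_outputs = generate(baseline_llm_kwargs)
    test_outputs = generate(test_llm_kwargs)

    for base_seq, test_seq in zip(baseline_outputs, test_outputs):
        for base_pos, test_pos in zip(base_seq, test_seq):
            base_by_rank = {lp.rank: (tok, lp.logprob)
                            for tok, lp in base_pos.items()}
            test_by_rank = {lp.rank: (tok, lp.logprob)
                            for tok, lp in test_pos.items()}
            # Same set of ranks, same token per rank, close logprob values.
            assert set(base_by_rank) == set(test_by_rank)
            for rank in base_by_rank:
                assert base_by_rank[rank][0] == test_by_rank[rank][0]
                assert math.isclose(base_by_rank[rank][1],
                                    test_by_rank[rank][1],
                                    abs_tol=1e-1)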
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode. # Required for spec decode.
"use_v2_block_manager": True, "use_v2_block_manager": True,
"max_logprobs": 6,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -364,57 +252,28 @@ def run_greedy_logprobs_correctness_test(baseline_llm_generator,
"disable_logprobs_during_spec_decoding": True, "disable_logprobs_during_spec_decoding": True,
}]) }])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_logprobs_disabled(baseline_llm_generator, test_llm_generator): @pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
32,
])
@pytest.mark.parametrize("logprobs", [0])
def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int, logprobs: int):
"""Check the behavior when logprobs are disabled. """Check the behavior when logprobs are disabled.
Token choices should match with the base model. Token choices should match with the base model.
""" """
prompts = [ run_logprob_correctness_test(vllm_runner,
"Hello, my name is", common_llm_kwargs,
"The president of the United States is", per_test_common_llm_kwargs,
"The capital of France is", baseline_llm_kwargs,
"The future of AI is", test_llm_kwargs,
"San Francisco is know for its", batch_size,
"Facebook was created in 2004 by", output_len,
"Curious George is a", seed,
"Python 3.11 brings improvements to its", temperature=0.0,
] logprobs=logprobs)
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(4))]
sampling_params = SamplingParams(
# Use smaller output len for fast test
max_tokens=7,
ignore_eos=True,
temperature=0.0,
logprobs=2,
)
spec_batch_logprobs = get_logprobs_from_llm_generator(
test_llm_generator, prompts, sampling_params)
baseline_batch_logprobs = get_logprobs_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
assert len(baseline_batch_logprobs) == len(prompts)
assert len(spec_batch_logprobs) == len(prompts)
# For each sequence in the batch.
for _, (baseline_logprobs, spec_logprobs) in enumerate(
zip(baseline_batch_logprobs, spec_batch_logprobs)):
assert len(spec_logprobs) == len(baseline_logprobs)
# For each generated position of the sequence.
for _, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
zip(spec_logprobs, baseline_logprobs)):
assert len(spec_pos_logprobs) == 1
spec_top_token_id = list(spec_pos_logprobs)[0]
spec_top_logprob = spec_pos_logprobs[spec_top_token_id]
assert spec_top_logprob.logprob == 0.0
assert spec_top_logprob.rank == -1
# check that the chosen token matches the base model
baseline_logprob = baseline_pos_logprobs[spec_top_token_id]
assert baseline_logprob.rank == 1
assert spec_top_logprob.decoded_token \
== baseline_logprob.decoded_token


@@ -21,7 +21,7 @@ correctness for the target model outputs.
import pytest import pytest
from .conftest import run_greedy_equality_correctness_test from .conftest import run_equality_correctness_test
# main model # main model
# lmsys/vicuna-7b-v1.3 was to be used but it's causing # lmsys/vicuna-7b-v1.3 was to be used but it's causing
@@ -55,7 +55,7 @@ PRECISION = "float32"
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -70,15 +70,21 @@ PRECISION = "float32"
]) ])
@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_medusa_e2e_greedy_correctness(baseline_llm_generator, def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
test_llm_generator, batch_size: int, per_test_common_llm_kwargs,
output_len: int): baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality with different batch size.""" """Verify greedy equality with different batch size."""
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -96,7 +102,7 @@ def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -111,17 +117,21 @@ def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
]) ])
@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator, def test_medusa_e2e_greedy_correctness_cuda_graph(
test_llm_generator, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
output_len: int): seed: int):
"""Verify greedy equality with cuda graph enabled and different """Verify greedy equality with cuda graph enabled and different
batch sizes.""" batch sizes."""
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -142,7 +152,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -160,18 +170,22 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
]) ])
@pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator, def test_medusa_e2e_greedy_correctness_with_preemption(
test_llm_generator, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
output_len: int): seed: int):
"""Verify greedy equality, even when some sequences are preempted mid- """Verify greedy equality, even when some sequences are preempted mid-
generation. generation.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -187,7 +201,7 @@ def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -209,16 +223,22 @@ def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_medusa_different_k(baseline_llm_generator, test_llm_generator, def test_medusa_different_k(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int): per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify that medusa speculative decoding produces exact equality """Verify that medusa speculative decoding produces exact equality
to without spec decode with different values of num_speculative_tokens. to without spec decode with different values of num_speculative_tokens.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -234,7 +254,7 @@ def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -252,17 +272,23 @@ def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_medusa_disable_queue(baseline_llm_generator, test_llm_generator, def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int): per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
output_len: int, seed: int):
"""Verify that medusa speculative decoding produces exact equality """Verify that medusa speculative decoding produces exact equality
to without spec decode when speculation is disabled for large to without spec decode when speculation is disabled for large
batch sizes. batch sizes.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
if __name__ == "__main__": if __name__ == "__main__":


@@ -25,8 +25,7 @@ import pytest
from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
from .conftest import (run_equality_correctness_test, from .conftest import run_equality_correctness_test
run_greedy_equality_correctness_test)
# main model # main model
MAIN_MODEL = "JackFram/llama-160m" MAIN_MODEL = "JackFram/llama-160m"
@@ -58,7 +57,7 @@ PRECISION = "float32"
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -72,14 +71,21 @@ PRECISION = "float32"
]) ])
@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator, def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int): per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality with different batch size.""" """Verify greedy equality with different batch size."""
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -98,7 +104,7 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -110,17 +116,21 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
@pytest.mark.parametrize("output_len", [2048]) @pytest.mark.parametrize("output_len", [2048])
@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator, def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int): per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int):
"""Verify acceptance rate with different batch size and large output """Verify acceptance rate with different batch size and large output
length.""" length."""
run_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size, batch_size,
max_output_len=output_len, max_output_len=output_len,
temperature=0.0, temperature=0.0,
seeded=True, seed=seed,
force_output_len=True,
expected_acceptance_rate=0.48) expected_acceptance_rate=0.48)
@@ -140,7 +150,7 @@ def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
# Speculative model # Speculative model
"speculative_model": SPEC_MODEL, "speculative_model": SPEC_MODEL,
@@ -151,28 +161,35 @@ def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
@pytest.mark.parametrize("output_len", [64]) @pytest.mark.parametrize("output_len", [64])
@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("temperature", [0.1, 1.0]) @pytest.mark.parametrize("temperature", [0.1, 1.0])
@pytest.mark.parametrize("seed", [None]) @pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator, def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, batch_size: int, output_len: int,
temperature: float): temperature: float, seed: int):
"""Verify seeded runs produce the same output.""" """Verify seeded runs produce the same output."""
run_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size, batch_size,
max_output_len=output_len, max_output_len=output_len,
temperature=temperature, temperature=temperature,
seeded=True, seed=seed)
force_output_len=True)
# Ensure this same test does fail if we _don't_ include per-request seeds # Ensure this same test does fail if we _don't_ include per-request seeds
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
run_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size, batch_size,
max_output_len=output_len, max_output_len=output_len,
temperature=temperature, temperature=temperature,
seeded=False, seed=seed,
force_output_len=True) disable_seed=True)
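(For reference, the seeded-equality property this test relies on can be reproduced with the public vLLM API: with a fixed per-request seed, sampling at nonzero temperature is expected to be repeatable, and without the seed it is not. A minimal, hedged sketch; the model and prompt below are chosen arbitrarily and are not taken from the test.)

from vllm import LLM, SamplingParams

llm = LLM(model="JackFram/llama-68m", enforce_eager=True)
params = SamplingParams(temperature=1.0, max_tokens=16, seed=1234)

out_a = llm.generate(["The future of AI is"], params)
out_b = llm.generate(["The future of AI is"], params)

# With the same per-request seed the sampled tokens should match exactly;
# dropping the seed would make this assertion flaky, which is what the
# pytest.raises(AssertionError) branch above checks at the test level.
assert out_a[0].outputs[0].token_ids == out_b[0].outputs[0].token_ids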
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -193,7 +210,7 @@ def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -210,18 +227,22 @@ def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
]) ])
@pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator, def test_mlp_e2e_greedy_correctness_with_preemption(
test_llm_generator, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
output_len: int): seed: int):
"""Verify greedy equality, even when some sequences are preempted mid- """Verify greedy equality, even when some sequences are preempted mid-
generation. generation.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -242,7 +263,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -259,10 +280,10 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
]) ])
@pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator, def test_mlp_e2e_greedy_correctness_with_padding(
test_llm_generator, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
output_len: int): seed: int):
"""Verify greedy equality when the vocab dimension is padded """Verify greedy equality when the vocab dimension is padded
""" """
@@ -273,11 +294,15 @@ def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
with patch( with patch(
"vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size", "vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size",
patched_pad_vocab_size): patched_pad_vocab_size):
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -293,7 +318,7 @@ def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -315,16 +340,22 @@ def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_mlp_different_k(baseline_llm_generator, test_llm_generator, def test_mlp_different_k(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int): per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, seed: int,
output_len: int):
"""Verify that mlp speculative decoding produces exact equality """Verify that mlp speculative decoding produces exact equality
to without spec decode with different values of num_speculative_tokens. to without spec decode with different values of num_speculative_tokens.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -340,7 +371,7 @@ def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
"dtype": PRECISION, "dtype": PRECISION,
# Main model # Main model
"model": MAIN_MODEL, "model_name": MAIN_MODEL,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -357,14 +388,20 @@ def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator, def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int): per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, seed: int,
output_len: int):
"""Verify that mlp speculative decoding produces exact equality """Verify that mlp speculative decoding produces exact equality
to without spec decode when speculation is disabled for large to without spec decode when speculation is disabled for large
batch sizes. batch sizes.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)


@@ -41,8 +41,9 @@ from transformers import AutoTokenizer
from vllm import SamplingParams from vllm import SamplingParams
from ...utils import fork_new_process_for_each_test
from .conftest import (get_output_from_llm_generator, from .conftest import (get_output_from_llm_generator,
run_greedy_equality_correctness_test) run_equality_correctness_test)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -73,6 +74,7 @@ from .conftest import (get_output_from_llm_generator,
@pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test
def test_spec_decode_e2e_with_detokenization(test_llm_generator, def test_spec_decode_e2e_with_detokenization(test_llm_generator,
batch_size: int): batch_size: int):
"""Run generation with speculative decoding on a batch. Verify the engine """Run generation with speculative decoding on a batch. Verify the engine
@@ -116,44 +118,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
assert actual_tokens.strip() == expected_tokens.strip() assert actual_tokens.strip() == expected_tokens.strip()
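(This file also starts decorating tests with fork_new_process_for_each_test, imported from ...utils in the hunk above. The actual utility is not shown in this diff; the following is a rough, assumed sketch of what a fork-per-test decorator of this kind typically does, namely running the test body in a forked child process so GPU and engine state cannot leak between tests.)

import functools
import os


def fork_new_process_for_each_test(fn):
    """Assumed sketch, not the vLLM implementation: run the wrapped test in a
    forked child process and fail the parent if the child exits non-zero."""

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child: run the test body and report success/failure via exit code.
            try:
                fn(*args, **kwargs)
                os._exit(0)
            except BaseException:
                os._exit(1)
        # Parent: wait for the child and surface its result to pytest.
        _, status = os.waitpid(pid, 0)
        assert os.WIFEXITED(status) and os.WEXITSTATUS(status) == 0

    return wrapper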
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
# Note this is repeated in the test body; to initialize a tokenizer.
"model": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True,
# Use AsyncLLM engine
"use_async": True,
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
},
])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_e2e_with_async_engine(test_llm_generator,
baseline_llm_generator,
batch_size: int):
"""Verify spec decode works well with async LLM engine.
"""
run_greedy_equality_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len=32,
force_output_len=True)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
@@ -172,10 +136,10 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
# Try two different tiny base models. # Try two different tiny base models.
# Note that one is equal to the draft model, another isn't. # Note that one is equal to the draft model, another isn't.
{ {
"model": "JackFram/llama-68m", "model_name": "JackFram/llama-68m",
}, },
{ {
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -189,13 +153,15 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
"output_len", "output_len",
[ [
# Use long output len for the small model test. # Use long output len for the small model test.
1536, 10,
]) ])
@pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
baseline_llm_generator, test_llm_generator, batch_size: int, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
output_len: int): baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality on a tiny model with batch size of one. """Verify greedy equality on a tiny model with batch size of one.
Since this test is cheaper than other e2e correctness tests, we generate Since this test is cheaper than other e2e correctness tests, we generate
@@ -204,14 +170,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
When the draft model is the same as the target model, we further check When the draft model is the same as the target model, we further check
whether all speculative tokens are accepted. whether all speculative tokens are accepted.
""" """
ensure_all_accepted = test_llm_generator.same_draft_target_model ensure_all_accepted = per_test_common_llm_kwargs.get(
run_greedy_equality_correctness_test( "model_name") == test_llm_kwargs.get("speculative_model")
baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True, test_llm_kwargs,
ensure_all_accepted=ensure_all_accepted) batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0,
ensure_all_accepted=ensure_all_accepted)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -232,10 +202,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Try two different tiny base models. # Try two different tiny base models.
# Note that one is equal to the draft model, another isn't. # Note that one is equal to the draft model, another isn't.
{ {
"model": "JackFram/llama-68m", "model_name": "JackFram/llama-68m",
}, },
{ {
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -253,16 +223,22 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
]) ])
@pytest.mark.parametrize("batch_size", [64]) @pytest.mark.parametrize("batch_size", [64])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
baseline_llm_generator, test_llm_generator, batch_size: int, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
output_len: int): baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality on a tiny model and large batch size. """Verify greedy equality on a tiny model and large batch size.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -280,10 +256,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
# Try two different tiny base models. # Try two different tiny base models.
# Note that one is equal to the draft model, another isn't. # Note that one is equal to the draft model, another isn't.
{ {
"model": "JackFram/llama-68m", "model_name": "JackFram/llama-68m",
}, },
{ {
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -298,24 +274,31 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
]) ])
@pytest.mark.parametrize("batch_size", [32]) @pytest.mark.parametrize("batch_size", [32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
baseline_llm_generator, test_llm_generator, batch_size: int, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
max_output_len: int): baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
max_output_len: int, seed: int):
"""Verify greedy equality on a tiny model, with a large batch size, and when """Verify greedy equality on a tiny model, with a large batch size, and when
sampling respects the EOS token. sampling respects the EOS token.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len, baseline_llm_kwargs,
force_output_len=False) test_llm_kwargs,
batch_size,
max_output_len,
seed=seed,
temperature=0.0,
ignore_eos=False)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# A "real" model (not tiny). # A "real" model (not tiny).
"model": "meta-llama/Llama-2-7b-chat-hf", "model_name": "meta-llama/Llama-2-7b-chat-hf",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
@@ -342,24 +325,30 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
256, 256,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_real_model_bs1( def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
baseline_llm_generator, test_llm_generator, batch_size: int, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
output_len: int): baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality on a "real" model and batch size of 1. This is """Verify greedy equality on a "real" model and batch size of 1. This is
separate from large BS tests to make identifying the source of bugs easier. separate from large BS tests to make identifying the source of bugs easier.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# A "real" model (not tiny). # A "real" model (not tiny).
"model": "meta-llama/Llama-2-7b-chat-hf", "model_name": "meta-llama/Llama-2-7b-chat-hf",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
@@ -386,17 +375,23 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
64, 64,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
baseline_llm_generator, test_llm_generator, batch_size: int, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
output_len: int): baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality with a "real" model on a nontrivial batch size. """Verify greedy equality with a "real" model on a nontrivial batch size.
This is the closest test to a real production workload. This is the closest test to a real production workload.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -415,7 +410,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -433,23 +428,29 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
]) ])
@pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_with_preemption( def test_spec_decode_e2e_greedy_correctness_with_preemption(
baseline_llm_generator, test_llm_generator, batch_size: int, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
output_len: int): baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality, even when some sequences are preempted mid- """Verify greedy equality, even when some sequences are preempted mid-
generation. generation.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
@@ -487,22 +488,29 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_spec_decode_different_block_size(baseline_llm_generator, @fork_new_process_for_each_test
test_llm_generator, batch_size: int, def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
output_len: int): per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality over different block sizes. """Verify greedy equality over different block sizes.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
@@ -534,24 +542,31 @@ def test_spec_decode_different_block_size(baseline_llm_generator,
64, 64,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_skip_speculation(baseline_llm_generator, test_llm_generator, @fork_new_process_for_each_test
batch_size: int, output_len: int): def test_skip_speculation(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality when some (or all) sequences skip speculation. """Verify greedy equality when some (or all) sequences skip speculation.
We do this by setting the max model len of the draft model to an We do this by setting the max model len of the draft model to an
artificially low value, such that when the sequences grow beyond it, they artificially low value, such that when the sequences grow beyond it, they
are skipped in speculative decoding. are skipped in speculative decoding.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
@@ -571,21 +586,28 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [10]) @pytest.mark.parametrize("output_len", [10])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_disable_speculation(baseline_llm_generator, test_llm_generator, @fork_new_process_for_each_test
batch_size: int, output_len: int): def test_disable_speculation(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality when all sequences disable speculation. """Verify greedy equality when all sequences disable speculation.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model": "JackFram/llama-68m", "model_name": "JackFram/llama-68m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
@@ -613,22 +635,28 @@ def test_disable_speculation(baseline_llm_generator, test_llm_generator,
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, @fork_new_process_for_each_test
output_len: int): def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
output_len: int, seed: int):
"""Verify that speculative decoding produces exact equality to without spec """Verify that speculative decoding produces exact equality to without spec
decode with many different values of k. decode with many different values of k.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
@@ -657,15 +685,22 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_typical_acceptance_sampling(baseline_llm_generator, @fork_new_process_for_each_test
test_llm_generator, batch_size: int, def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs,
output_len: int): per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int,
seed: int):
"""Verify that speculative decoding produces exact equality to without spec """Verify that speculative decoding produces exact equality to without spec
decode with TypicalAcceptanceSampler as the draft token acceptance decode with TypicalAcceptanceSampler as the draft token acceptance
sampling method. sampling method.
""" """
run_greedy_equality_correctness_test(baseline_llm_generator, run_equality_correctness_test(vllm_runner,
test_llm_generator, common_llm_kwargs,
batch_size, per_test_common_llm_kwargs,
max_output_len=output_len, baseline_llm_kwargs,
force_output_len=True) test_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)

Some files were not shown because too many files have changed in this diff.