bump version to v0.6.1.post1 (#8440 )

[bugfix] torch profiler bug for single gpu with GPUExecutor (#8354 )
[Bugfix] Fix async log stats (#8417 )
2024-09-12 21:39:49 -07:00 · 2024-09-12 21:30:00 -07:00 · 2024-09-12 20:48:59 -07:00 · 2024-09-13 03:47:52 +00:00 · 2024-09-13 03:21:42 +00:00 · 2024-09-13 11:06:28 +08:00
226 changed files with 12669 additions and 3917 deletions
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
 HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 #ignore certain kernels tests
 if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
  --ignore=kernels/test_attention.py \
  --ignore=kernels/test_attention_selector.py \
  --ignore=kernels/test_blocksparse_attention.py \
  --ignore=kernels/test_causal_conv1d.py \
  --ignore=kernels/test_cutlass.py \
  --ignore=kernels/test_encoder_decoder_attn.py \
  --ignore=kernels/test_flash_attn.py \
  --ignore=kernels/test_flashinfer.py \
  --ignore=kernels/test_int8_quant.py \
  --ignore=kernels/test_machete_gemm.py \
  --ignore=kernels/test_mamba_ssm.py \
  --ignore=kernels/test_marlin_gemm.py \
  --ignore=kernels/test_moe.py \
  --ignore=kernels/test_prefix_prefill.py \
  --ignore=kernels/test_rand.py \
  --ignore=kernels/test_sampler.py"
 fi
 PARALLEL_JOB_COUNT=8
 # If the command contains a shard flag, run all shards in parallel because the host has 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    #replace shard arguments
-    commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
+    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
    echo "Shard ${GPU} commands:$commands"
    docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -0,0 +1,33 @@
 # This script builds the CPU docker image and runs the offline inference inside the container.
 # It serves as a sanity check for compilation and basic model usage.
 set -ex
 # Try building the docker image
 docker build -t cpu-test -f Dockerfile.ppc64le .
 # Setup cleanup
 remove_docker_container() { docker rm -f cpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image, setting --shm-size=4g for tensor parallel.
 source /etc/environment
 #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
 # Run basic model test
 docker exec cpu-test bash -c "
  pip install pytest matplotlib einops transformers_stream_generator
  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
 # online inference
 docker exec cpu-test bash -c "
  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & 
  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
  python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name random \
    --model facebook/opt-125m \
    --num-prompts 20 \
    --endpoint /v1/completions \
    --tokenizer facebook/opt-125m"
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -30,6 +30,12 @@ docker exec cpu-test bash -c "
      --ignore=tests/models/test_jamba.py \
      --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
 # Run compressed-tensor test
 docker exec cpu-test bash -c "
  pytest -s -v \
  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
 # online inference
 docker exec cpu-test bash -c "
  export VLLM_CPU_KVCACHE_SPACE=10 
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -50,6 +50,7 @@ steps:
  - tests/worker
  commands:
  - pytest -v -s async_engine # Async Engine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
@@ -91,6 +92,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/openai
  - pytest -v -s entrypoints/test_chat_utils.py
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 - label: Distributed Tests (4 GPUs) # 10min
@@ -158,6 +160,7 @@ steps:
    - python3 offline_inference_with_prefix.py
    - python3 llm_engine_example.py
    - python3 offline_inference_vision_language.py
    - python3 offline_inference_vision_language_multi_image.py
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference_encoder_decoder.py
@@ -216,7 +219,8 @@ steps:
  commands:
    # See https://github.com/vllm-project/vllm/issues/5152
    - export VLLM_ATTENTION_BACKEND=XFORMERS
-    - pytest -v -s spec_decode
+    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
    - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
 - label: LoRA Test %N # 30min each
  mirror_hardwares: [amd]
@@ -227,6 +231,7 @@ steps:
  parallelism: 4
 - label: Kernels Test %N # 30min each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
  - vllm/attention
@@ -368,6 +373,7 @@ steps:
 - label: LoRA Long Context (Distributed) # 11min
  # This test runs llama 13B, so it is required to run on 4 GPUs.
  num_gpus: 4
  soft_fail: true
  source_file_dependencies:
  - vllm/lora
  - tests/lora/test_long_context
@@ -384,7 +390,18 @@ steps:
  - vllm/
  - tests/weight_loading
  commands:
-    - bash weight_loading/run_model_weight_loading_test.sh
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 - label: Weight Loading Multiple GPU Test - Large Models # optional
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt 
 ##### multi gpus test #####
--- a/.github/ISSUE_TEMPLATE/400-bug
+++ b/.github/ISSUE_TEMPLATE/400-bug
@@ -30,6 +30,15 @@ body:
      </details>
  validations:
    required: true
 - type: textarea
  attributes:
    label: Model Input Dumps
    description: |
      If you are experiencing crashes due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support the .pkl file format) and upload it here. This will help us reproduce the issue and facilitate the debugging process.
    placeholder: |
      Upload the dumped input file.
  validations:
    required: false
 - type: textarea
  attributes:
    label: 🐛 Describe the bug
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
 </ul>
 <h3>Adding or changing kernels</h3>
 <p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
 <ul>
    <li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
    <li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
    <li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops.  See <code>tests/kernels</code> for examples.</li>
    <li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
    <li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.</li>
 </ul>
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,7 +181,6 @@ set(VLLM_EXT_SRC
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
@@ -196,9 +195,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  FetchContent_Declare(
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        # CUTLASS 3.5.1
+        GIT_TAG v3.5.1
        GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9 
        GIT_PROGRESS TRUE
        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
        # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
        # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
        GIT_SHALLOW TRUE
  )
  FetchContent_MakeAvailable(cutlass)
@@ -232,6 +235,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
          "-gencode arch=compute_90a,code=sm_90a")
  endif()
  #
  # Machete kernels
@@ -290,6 +294,12 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)
 # If CUTLASS is compiled on NVCC >= 12.5, it by default uses 
 # cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the 
 # driver API. This causes problems when linking with earlier versions of CUDA.
 # Setting this variable sidesteps the issue by calling the driver directly.
 target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 #
 # _moe_C extension
 #
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,128 @@
 # vLLM Code of Conduct
 ## Our Pledge
 We as members, contributors, and leaders pledge to make participation in our
 community a harassment-free experience for everyone, regardless of age, body
 size, visible or invisible disability, ethnicity, sex characteristics, gender
 identity and expression, level of experience, education, socioeconomic status,
 nationality, personal appearance, race, caste, color, religion, or sexual
 identity and orientation.
 We pledge to act and interact in ways that contribute to an open, welcoming,
 diverse, inclusive, and healthy community.
 ## Our Standards
 Examples of behavior that contributes to a positive environment for our
 community include:
 * Demonstrating empathy and kindness toward other people
 * Being respectful of differing opinions, viewpoints, and experiences
 * Giving and gracefully accepting constructive feedback
 * Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
 * Focusing on what is best not just for us as individuals, but for the overall
  community
 Examples of unacceptable behavior include:
 * The use of sexualized language or imagery, and sexual attention or advances of
  any kind
 * Trolling, insulting or derogatory comments, and personal or political attacks
 * Public or private harassment
 * Publishing others' private information, such as a physical or email address,
  without their explicit permission
 * Other conduct which could reasonably be considered inappropriate in a
  professional setting
 ## Enforcement Responsibilities
 Community leaders are responsible for clarifying and enforcing our standards of
 acceptable behavior and will take appropriate and fair corrective action in
 response to any behavior that they deem inappropriate, threatening, offensive,
 or harmful.
 Community leaders have the right and responsibility to remove, edit, or reject
 comments, commits, code, wiki edits, issues, and other contributions that are
 not aligned to this Code of Conduct, and will communicate reasons for moderation
 decisions when appropriate.
 ## Scope
 This Code of Conduct applies within all community spaces, and also applies when
 an individual is officially representing the community in public spaces.
 Examples of representing our community include using an official email address,
 posting via an official social media account, or acting as an appointed
 representative at an online or offline/IRL event.
 ## Enforcement
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement in the #code-of-conduct
 channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
 All complaints will be reviewed and investigated promptly and fairly.
 All community leaders are obligated to respect the privacy and security of the
 reporter of any incident.
 ## Enforcement Guidelines
 Community leaders will follow these Community Impact Guidelines in determining
 the consequences for any action they deem in violation of this Code of Conduct:
 ### 1. Correction
 **Community Impact**: Use of inappropriate language or other behavior deemed
 unprofessional or unwelcome in the community.
 **Consequence**: A private, written warning from community leaders, providing
 clarity around the nature of the violation and an explanation of why the
 behavior was inappropriate. A public apology may be requested.
 ### 2. Warning
 **Community Impact**: A violation through a single incident or series of
 actions.
 **Consequence**: A warning with consequences for continued behavior. No
 interaction with the people involved, including unsolicited interaction with
 those enforcing the Code of Conduct, for a specified period of time. This
 includes avoiding interactions in community spaces as well as external channels
 like social media. Violating these terms may lead to a temporary or permanent
 ban.
 ### 3. Temporary Ban
 **Community Impact**: A serious violation of community standards, including
 sustained inappropriate behavior.
 **Consequence**: A temporary ban from any sort of interaction or public
 communication with the community for a specified period of time. No public or
 private interaction with the people involved, including unsolicited interaction
 with those enforcing the Code of Conduct, is allowed during this period.
 Violating these terms may lead to a permanent ban.
 ### 4. Permanent Ban
 **Community Impact**: Demonstrating a pattern of violation of community
 standards, including sustained inappropriate behavior, harassment of an
 individual, or aggression toward or disparagement of classes of individuals.
 **Consequence**: A permanent ban from any sort of public interaction within the
 community.
 ## Attribution
 This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
 version 2.1, available at
 [v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
 Community Impact Guidelines were inspired by
 [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
 For answers to common questions about this code of conduct, see the
 [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
 [Contributor Covenant translations](https://www.contributor-covenant.org/translations).
--- a/11
+++ b/11
@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 ENV DEBIAN_FRONTEND=noninteractive
 # Install Python and other dependencies
@@ -37,7 +37,6 @@ WORKDIR /workspace
 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-cuda.txt
@@ -66,7 +65,6 @@ COPY setup.py setup.py
 COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm vllm
@@ -135,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # image with vLLM installed
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
@@ -147,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
@@ -181,6 +180,10 @@ FROM vllm-base AS test
 ADD . /vllm-workspace/
 # install development dependencies (for testing)
 # A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
 # This installation must complete before the test dependencies are collected and installed.
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install "setuptools>=74.1.1"
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-dev.txt
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -2,9 +2,14 @@
 FROM ubuntu:22.04 AS cpu-test-1
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt
 # install oneDNN
 RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
 RUN --mount=type=cache,target=/root/.cache/ccache \
    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \ 
    -DONEDNN_BUILD_DOC=OFF \ 
    -DONEDNN_BUILD_EXAMPLES=OFF \ 
    -DONEDNN_BUILD_TESTS=OFF \ 
    -DONEDNN_BUILD_GRAPH=OFF \ 
    -DONEDNN_ENABLE_WORKLOAD=INFERENCE \ 
    -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
    cmake --build ./oneDNN/build --target install --config Release
 FROM cpu-test-1 AS build
 WORKDIR /workspace/vllm
@@ -40,7 +58,6 @@ COPY ./ ./
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"
 # Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
+RUN apt-get update \
    && apt-get install python3 python3-pip -y \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -4,7 +4,8 @@
 FROM ubuntu:22.04 AS dev
 RUN apt-get update -y && \
-    apt-get install -y python3-pip git
+    apt-get install -y python3-pip git && \
    apt-get install -y ffmpeg libsm6 libxext6 libgl1 
 WORKDIR /workspace
 # copy requirements
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -2,21 +2,26 @@ FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root
-RUN apt-get update  -y     && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
 RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults     python=3.10     pytorch-cpu=2.1.2     torchvision-cpu=0.16.2    &&     micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
 # These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
-WORKDIR /vllm-workspace
+WORKDIR /workspace/
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
+
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
 FROM $BASE_IMAGE
 WORKDIR /workspace
 # Install some basic utilities
 RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1 
 # Install the TPU and Pallas dependencies.
 RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
 RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 RUN apt-get update  -y \
-&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1 
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,4 @@
 include LICENSE
 include requirements-adag.txt
 include requirements-common.txt
 include requirements-cuda.txt
 include requirements-rocm.txt
--- a/README.md
+++ b/README.md
@@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone
 ---
-**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
+**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
-We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
+We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
-Join us to hear the vLLM's recent update about performance.
+Join us to learn more about recent advancements of vLLM on MI300X.
-Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
+Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
 ---
 *Latest News* 🔥
 - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
 - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
 - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
 - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
@@ -130,3 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
  year={2023}
 }
 ```
 ## Contact Us
 * For technical questions and feature requests, please use GitHub issues or discussions.
 * For discussing with fellow users, please use Discord.
 * For security disclosures, please use GitHub's security advisory feature.
 * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -24,6 +24,7 @@ class RequestFuncInput:
    model: str
    best_of: int = 1
    use_beam_search: bool = False
    logprobs: Optional[int] = None
@dataclass
@@ -236,6 +237,7 @@ async def async_request_openai_completions(
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
        }
        headers = {
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -10,7 +10,7 @@ import torch
 from tqdm import tqdm
 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
 from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
@@ -205,13 +205,11 @@ if __name__ == '__main__':
        default=None,
        help=('path to save the pytorch profiler output. Can be visualized '
              'with ui.perfetto.dev or Tensorboard.'))
-    parser.add_argument(
+    parser.add_argument("--device",
-        "--device",
+                        type=str,
-        type=str,
+                        default="auto",
-        default="auto",
+                        choices=DEVICE_OPTIONS,
-        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+                        help='device type for vLLM execution')
        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
        'CPU.')
    parser.add_argument('--block-size',
                        type=int,
                        default=16,
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -195,8 +195,16 @@ def sample_sonnet_requests(
 def sample_random_requests(
-        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
+    prefix_len: int,
-        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+    input_len: int,
    output_len: int,
    num_prompts: int,
    range_ratio: float,
    tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int]]:
    prefix_token_ids = np.random.randint(0,
                                         tokenizer.vocab_size,
                                         size=prefix_len).tolist()
    input_lens = np.random.randint(
        int(input_len * range_ratio),
@@ -211,10 +219,12 @@ def sample_random_requests(
    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
    input_requests = []
    for i in range(num_prompts):
-        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+        prompt = tokenizer.decode(prefix_token_ids +
                                  [(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])
        input_requests.append(
-            (prompt, int(input_lens[i]), int(output_lens[i])))
+            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
    return input_requests
@@ -318,6 +328,7 @@ async def benchmark(
    model_id: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
    logprobs: Optional[int],
    best_of: int,
    use_beam_search: bool,
    request_rate: float,
@@ -339,6 +350,7 @@ async def benchmark(
        api_url=api_url,
        prompt_len=test_prompt_len,
        output_len=test_output_len,
        logprobs=logprobs,
        best_of=best_of,
        use_beam_search=use_beam_search,
    )
@@ -358,6 +370,7 @@ async def benchmark(
            api_url=base_url + "/start_profile",
            prompt_len=test_prompt_len,
            output_len=test_output_len,
            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
        )
@@ -379,6 +392,7 @@ async def benchmark(
            api_url=api_url,
            prompt_len=prompt_len,
            output_len=output_len,
            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
        )
@@ -396,6 +410,7 @@ async def benchmark(
            api_url=base_url + "/stop_profile",
            prompt_len=test_prompt_len,
            output_len=test_output_len,
            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
        )
@@ -562,6 +577,7 @@ def main(args: argparse.Namespace):
    elif args.dataset_name == "random":
        input_requests = sample_random_requests(
            prefix_len=args.random_prefix_len,
            input_len=args.random_input_len,
            output_len=args.random_output_len,
            num_prompts=args.num_prompts,
@@ -580,6 +596,7 @@ def main(args: argparse.Namespace):
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
            logprobs=args.logprobs,
            best_of=args.best_of,
            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
@@ -721,6 +738,16 @@ if __name__ == "__main__":
        help=
        "Number of output tokens per request, used only for sonnet dataset.",
    )
    parser.add_argument(
        "--logprobs",
        type=int,
        default=None,
        help=("Number of logprobs-per-token to compute & return as part of "
              "the request. If unspecified, then either (1) if beam search "
              "is disabled, no logprobs are computed & a single dummy "
              "logprob is returned for each token; or (2) if beam search "
              "is enabled 1 logprob per token is computed"),
    )
    parser.add_argument(
        "--sonnet-prefix-len",
        type=int,
@@ -749,6 +776,14 @@ if __name__ == "__main__":
        help="Range of sampled ratio of input/output length, "
        "used only for random sampling.",
    )
    parser.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
        help="Number of fixed prefix tokens before random "
        " context. The length range of context in a random "
        " request is [random-prefix-len, "
        " random-prefix-len + random-prefix-len * random-range-ratio).")
    parser.add_argument(
        "--request-rate",
        type=float,
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -11,7 +11,7 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -451,13 +451,11 @@ if __name__ == "__main__":
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')
-    parser.add_argument(
+    parser.add_argument("--device",
-        "--device",
+                        type=str,
-        type=str,
+                        default="auto",
-        default="auto",
+                        choices=DEVICE_OPTIONS,
-        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+                        help='device type for vLLM execution')
        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
        'CPU.')
    parser.add_argument(
        "--num-scheduler-steps",
        type=int,
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,4 +1,5 @@
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_CXX_STANDARD 17)
 #
 # Define environment variables for special configurations
@@ -83,12 +84,7 @@ endif()
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
-list(APPEND LIBS "numa")
+list(APPEND LIBS dnnl numa)
 #
 # Define extension targets
 #
 #
 # _C extension
@@ -102,6 +98,16 @@ set(VLLM_EXT_SRC
    "csrc/cpu/pos_encoding.cpp"
    "csrc/cpu/torch_bindings.cpp")
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
        "csrc/cpu/quant.cpp"
        ${VLLM_EXT_SRC})
 endif()
 #
 # Define extension targets
 #
 define_gpu_extension_target(
    _C
    DESTINATION vllm
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -350,6 +350,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
    ${GPU_INCLUDE_DIRECTORIES})
  # TODO: is torch_python_LIBRARY needed?
  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
    ${GPU_LIBRARIES})
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -24,8 +24,8 @@ namespace vec_op {
 #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
 #define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  std::cout << #NAME << " invoked." << std::endl;
+  RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME)
 #endif
 #define FORCE_INLINE __attribute__((always_inline)) inline
@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  explicit BF16Vec16(const FP32Vec16 &);
  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
  void save(void* ptr, const int elem_num) const {
    constexpr uint32_t M = 0xFFFFFFFF;
    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
    _mm256_mask_storeu_epi16(ptr, mask, reg);
  }
 };
 #ifdef __AVX512F__
@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }
  FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
    return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg)));
  }
  FP32Vec16 max(const FP32Vec16& b) const {
    return FP32Vec16(_mm512_max_ps(reg, b.reg));
  }
  FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
    constexpr uint32_t M = 0xFFFFFFFF;
    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
    return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
  }
  FP32Vec16 abs() const {
    return FP32Vec16(_mm512_abs_ps(reg));
  } 
  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
  float reduce_max() const { return _mm512_reduce_max_ps(reg); }
  template <int group_size> float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  }
  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
  void save(float* ptr, const int elem_num) const {
    constexpr uint32_t M = 0xFFFFFFFF;
    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
    _mm512_mask_storeu_ps(ptr, mask, reg);
  }
 };
 #else
 struct FP32Vec16 : public Vec<FP32Vec16> {
@@ -433,6 +465,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 };
 #endif
 #ifdef __AVX512F__
 // Vector of 16 signed 8-bit integers held in one 128-bit register.
 struct INT8Vec16 : public Vec<INT8Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  // Lane-wise view of the register contents.
  union AliasReg {
    __m128i reg;
    int8_t values[VEC_ELEM_NUM];
  };
  __m128i reg;
  // Converts the 16 fp32 lanes of `vec` to int32 using round-to-nearest with
  // floating-point exceptions suppressed, then narrows every lane to int8
  // with _mm512_cvtepi32_epi8 (a truncating, non-saturating narrowing per
  // the Intel intrinsics guide), so callers that need saturation must clamp
  // to the int8 range beforehand.
  explicit INT8Vec16(const FP32Vec16& vec)
      : reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(
            vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {}
  // Stores all 16 lanes to `ptr` (unaligned store).
  void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); }
  // Stores only the first `elem_num` lanes to `ptr`.
  // The mask is built by shifting an all-ones word right by (32 - elem_num),
  // which leaves the low `elem_num` bits set for elem_num in [1, 16].
  void save(int8_t* ptr, const int elem_num) const {
    constexpr uint32_t kAllOnes = 0xFFFFFFFF;
    const __mmask16 store_mask = _cvtu32_mask16(kAllOnes >> (32 - elem_num));
    _mm_mask_storeu_epi8(ptr, store_mask, reg);
  }
 };
 #endif
 template <typename T> struct VecType { using vec_type = void; };
 template <typename T> using vec_t = typename VecType<T>::vec_type;
--- a/csrc/cpu/dnnl_helper.hpp
+++ b/csrc/cpu/dnnl_helper.hpp
@@ -0,0 +1,168 @@
 #ifndef DNNL_HELPER_HPP
 #define DNNL_HELPER_HPP
 #include <c10/util/BFloat16.h>
 #include "oneapi/dnnl/dnnl.hpp"
 namespace {
 template <typename T>
 struct DNNLType {
  static constexpr dnnl::memory::data_type type =
      dnnl::memory::data_type::undef;
 };
 template <>
 struct DNNLType<int8_t> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
 };
 template <>
 struct DNNLType<int32_t> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
 };
 template <>
 struct DNNLType<float> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
 };
 template <>
 struct DNNLType<c10::BFloat16> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
 };
 template <typename T>
 constexpr inline dnnl::memory::data_type get_dnnl_type() {
  return DNNLType<std::decay_t<T>>::type;
 }
 };  // namespace
 template <bool InputNoScale>
 class DNNLPrimitiveHelper {
 public:
  // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
  // A: [M, K], row-major
  // B: [K, N], column-major
  // C: [M, N], row-major
  // bias: [N], row-major, optional
  // a_scales: [MS]
  // b_scales: [NS]
  // Note: Due to the limitation of oneDNN
  // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
  // not supported.
  template <typename OutputT, typename BiasT>
  static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
                            const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
                            dnnl_dim_t K, const float* a_scales,
                            const float* b_scales, dnnl_dim_t MS,
                            dnnl_dim_t NS) {
    auto&& OutputType = get_dnnl_type<OutputT>();
    auto&& BiasType = get_dnnl_type<BiasT>();
    dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
    dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
    dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
    dnnl::primitive_attr attr;
    if constexpr (!InputNoScale) {
      if (MS == 1) {
        // per-tensor
        attr.set_scales_mask(DNNL_ARG_SRC, 0);
      } else {
        // per-token
        TORCH_CHECK(false, "per-token quantization is unsupported.");
      }
    }
    if (NS == 1) {
      // per-tensor
      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
    } else {
      // per-channel
      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
    }
    dnnl::matmul::primitive_desc matmul_pd;
    if (bias) {
      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                               bias_md, c_md, attr);
    } else {
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                               c_md, attr);
    }
    dnnl::matmul matmul(matmul_pd);
    auto& engine = default_engine();
    dnnl::memory a_m(a_md, engine, (void*)a);
    dnnl::memory b_m(b_md, engine, (void*)b);
    dnnl::memory c_m(c_md, engine, (void*)c);
    dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
                            (void*)a_scales);
    dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
                            (void*)b_scales);
    auto& stream = default_stream();
    if constexpr (InputNoScale) {
      if (bias) {
        dnnl::memory::desc bias_md({N}, BiasType, {1});
        dnnl::memory bias_m(bias_md, engine, (void*)bias);
        matmul.execute(
            stream, {
                        {DNNL_ARG_SRC, a_m},
                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_BIAS, bias_m},
                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      } else {
        matmul.execute(
            stream, {
                        {DNNL_ARG_SRC, a_m},
                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      }
    } else {
      if (bias) {
        dnnl::memory::desc bias_md({N}, BiasType, {1});
        dnnl::memory bias_m(bias_md, engine, (void*)bias);
        matmul.execute(
            stream, {
                        {DNNL_ARG_SRC, a_m},
                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_BIAS, bias_m},
                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      } else {
        matmul.execute(
            stream, {
                        {DNNL_ARG_SRC, a_m},
                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      }
    }
    stream.wait();
  }
 private:
  static dnnl::engine& default_engine() {
    static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
    return engine;
  }
  static dnnl::stream& default_stream() {
    static dnnl::stream stream(default_engine());
    return stream;
  }
 };
 #endif
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@@ -0,0 +1,294 @@
 #include "cpu_types.hpp"
 #include "dnnl_helper.hpp"
 namespace {
 // Compile-time mapping from a tensor element type to the vector types used
 // by the kernels in this file:
 //   load_vec_type - vector type constructed from / saved to memory holding
 //                   scalar_t elements;
 //   cvt_vec_type  - vector type the arithmetic is carried out in.
 // The primary template maps both to void; only the specializations below
 // name usable vector types.
 template <typename scalar_t>
 struct KernelVecType {
  using load_vec_type = void;
  using cvt_vec_type = void;
 };
 // float: loaded and processed as FP32Vec16.
 template <>
 struct KernelVecType<float> {
  using load_vec_type = vec_op::FP32Vec16;
  using cvt_vec_type = vec_op::FP32Vec16;
 };
 // c10::BFloat16: loaded as BF16Vec16, processed as FP32Vec16.
 template <>
 struct KernelVecType<c10::BFloat16> {
  using load_vec_type = vec_op::BF16Vec16;
  using cvt_vec_type = vec_op::FP32Vec16;
 };
 #ifdef __AVX512F__
 template <typename scalar_t>
 void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                   const float* scale, const int num_tokens,
                                   const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  constexpr float i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
  constexpr float i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
  const cvt_vec_t inv_scale(1.0 / *scale);
  const cvt_vec_t i8_min_vec(i8_min);
  const cvt_vec_t i8_max_vec(i8_max);
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    int j = 0;
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      load_vec_t elems(input + i * hidden_size + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
      vec_op::INT8Vec16 elems_int8(elems_fp32);
      elems_int8.save(output + i * hidden_size + j);
    }
    load_vec_t elems(input + i * hidden_size + j);
    cvt_vec_t elems_fp32(elems);
    elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
    vec_op::INT8Vec16 elems_int8(elems_fp32);
    if (j + vec_elem_num == hidden_size) {
      elems_int8.save(output + i * hidden_size + j);
    } else {
      elems_int8.save(output + i * hidden_size + j, hidden_size - j);
    }
  }
 }
 template <typename scalar_t>
 void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    float* scale, const int num_tokens,
                                    const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    cvt_vec_t max_abs(0.0);
    {
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input + i * hidden_size + j);
        cvt_vec_t elems_fp32(elems);
        max_abs = max_abs.max(elems_fp32.abs());
      }
      load_vec_t elems(input + i * hidden_size + j);
      cvt_vec_t elems_fp32(elems);
      if (j + vec_elem_num == hidden_size) {
        max_abs = max_abs.max(elems_fp32.abs());
      } else {
        max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
      }
    }
    float scale_val = max_abs.reduce_max() / 127.0f;
    scale[i] = scale_val;
    const cvt_vec_t inv_scale(1.0 / scale_val);
    {
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input + i * hidden_size + j);
        cvt_vec_t elems_fp32(elems);
        elems_fp32 = (elems_fp32 * inv_scale);
        vec_op::INT8Vec16 elems_int8(elems_fp32);
        elems_int8.save(output + i * hidden_size + j);
      }
      load_vec_t elems(input + i * hidden_size + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = (elems_fp32 * inv_scale);
      vec_op::INT8Vec16 elems_int8(elems_fp32);
      if (j + vec_elem_num == hidden_size) {
        elems_int8.save(output + i * hidden_size + j);
      } else {
        elems_int8.save(output + i * hidden_size + j, hidden_size - j);
      }
    }
  }
 }
 template <bool Bias, typename scalar_t>
 void dynamic_output_scale_impl(const float* input, scalar_t* output,
                               const float* scale, const scalar_t* bias,
                               const int num_tokens, const int hidden_size) {
  CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    int j = 0;
    cvt_vec_t token_scale_vec(scale[i]);
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      cvt_vec_t elems_fp32(input + i * hidden_size + j);
      elems_fp32 = elems_fp32 * token_scale_vec;
      if constexpr (Bias) {
        load_vec_t bias_vec(bias + j);
        cvt_vec_t bias_vec_fp32(bias_vec);
        elems_fp32 = elems_fp32 + bias_vec_fp32;
      }
      load_vec_t elems_out(elems_fp32);
      elems_out.save(output + i * hidden_size + j);
    }
    cvt_vec_t elems_fp32(input + i * hidden_size + j);
    elems_fp32 = elems_fp32 * token_scale_vec;
    if constexpr (Bias) {
      load_vec_t bias_vec(bias + j);
      cvt_vec_t bias_vec_fp32(bias_vec);
      elems_fp32 = elems_fp32 + bias_vec_fp32;
    }
    load_vec_t elems_out(elems_fp32);
    if (j + vec_elem_num == hidden_size) {
      elems_out.save(output + i * hidden_size + j);
    } else {
      elems_out.save(output + i * hidden_size + j, hidden_size - j);
    }
  }
 }
 #else
 // Fallback used when __AVX512F__ is not defined: keeps the same signature
 // as the vectorized version but unconditionally raises via TORCH_CHECK.
 template <typename scalar_t>
 void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    const float* scale, const int num_tokens,
                                    const int hidden_size) {
  TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
 }
 // Fallback used when __AVX512F__ is not defined: keeps the same signature
 // as the vectorized version but unconditionally raises via TORCH_CHECK.
 template <typename scalar_t>
 void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                     float* scale, const int num_tokens,
                                     const int hidden_size) {
  TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
 }
 // Fallback used when __AVX512F__ is not defined. The template parameters
 // and the argument list mirror the vectorized version so that the call
 // sites in this file (dynamic_output_scale_impl<true/false>(...)) still
 // resolve to this stub; it unconditionally raises via TORCH_CHECK.
 template <bool Bias, typename scalar_t>
 void dynamic_output_scale_impl(const float* input, scalar_t* output,
                                const float* scale, const scalar_t* bias,
                                const int num_tokens, const int hidden_size) {
  TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
 }
 #endif
 }  // namespace
 // W8A8 GEMM: c = (a_scales * a) @ (b_scales * b) (+ bias), with int8 `a`
 // and `b` and a floating-point `c`.
 // - One activation scale (a_scales.numel() == 1): the whole computation,
 //   including the optional bias, is done by the oneDNN matmul.
 // - One activation scale per row: the matmul produces an unscaled fp32
 //   result (weight scales only) into a temporary, and the row scales and
 //   optional bias are applied afterwards by dynamic_output_scale_impl.
 // Raises via TORCH_CHECK when a shape, stride or dtype precondition fails.
 void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
                     const torch::Tensor& a,         // [M, IC], row-major
                     const torch::Tensor& b,         // [IC, OC], column-major
                     const torch::Tensor& a_scales,  // [1] or [M]
                     const torch::Tensor& b_scales,  // [1] or [OC]
                     const c10::optional<torch::Tensor>& bias  // [OC]
 ) {
  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
  // Checks for conformality
  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
              "int8_scaled_mm only supports INT8 inputs.")
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
  // Check for strides and alignment
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16 Byte Alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
  if (bias) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
                bias->dim() == 1);
  }
  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
    // GEMM problem size: [rows, inner] x [inner, cols].
    const int64_t rows = a.size(0);
    const int64_t cols = b.size(1);
    const int64_t inner = a.size(1);
    const int8_t* a_ptr = a.data_ptr<int8_t>();
    const int8_t* b_ptr = b.data_ptr<int8_t>();
    if (a_scales.numel() == 1) {
      // per-tensor
      if (bias.has_value()) {
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a_ptr, b_ptr, c.data_ptr<scalar_t>(), bias->data_ptr<scalar_t>(),
            rows, cols, inner, a_scales.data_ptr<float>(),
            b_scales.data_ptr<float>(), a_scales.numel(), b_scales.numel());
      } else {
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a_ptr, b_ptr, c.data_ptr<scalar_t>(), static_cast<void*>(nullptr),
            rows, cols, inner, a_scales.data_ptr<float>(),
            b_scales.data_ptr<float>(), a_scales.numel(), b_scales.numel());
      }
      return;
    }
    // per-token
    // Note: oneDNN doesn't support per-token activation quantization
    torch::Tensor tmp_fp32_out =
        torch::empty_like(c, ::at::ScalarType::Float);
    // Matmul with weight scales only: no bias, no activation scale.
    DNNLPrimitiveHelper<true>::gemm_s8s8_jit(
        a_ptr, b_ptr, tmp_fp32_out.data_ptr<float>(),
        static_cast<void*>(nullptr), rows, cols, inner, nullptr,
        b_scales.data_ptr<float>(), 0, b_scales.numel());
    // Apply the per-row activation scales (and bias) while converting to
    // the output element type.
    if (bias.has_value()) {
      dynamic_output_scale_impl<true>(
          tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
          a_scales.data_ptr<float>(), bias->data_ptr<scalar_t>(), c.size(0),
          c.size(1));
    } else {
      dynamic_output_scale_impl<false>(
          tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
          a_scales.data_ptr<float>(), static_cast<scalar_t*>(nullptr),
          c.size(0), c.size(1));
    }
  });
 }
 // static-per-tensor quantization.
 // Treats `input` as rows of input.size(-1) elements and quantizes every row
 // into `out` with the single value stored in `scale`. `input` and `out`
 // must be contiguous and `scale` must hold exactly one element.
 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                               const torch::Tensor& input,  // [..., hidden_size]
                               const torch::Tensor& scale) {
  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scale.numel() == 1);
  // Length of one row and the number of rows once the leading dimensions
  // are flattened.
  const int row_len = input.size(-1);
  const int row_count = input.numel() / row_len;
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
        const scalar_t* src = input.data_ptr<scalar_t>();
        int8_t* dst = out.data_ptr<int8_t>();
        const float* scale_ptr = scale.data_ptr<float>();
        static_scaled_int8_quant_impl(src, dst, scale_ptr, row_count, row_len);
      });
 }
 // dynamic-per-token quantization.
 // Treats `input` as rows of input.size(-1) elements; for every row the
 // kernel computes a scale, writes it to `scale`, and writes the quantized
 // row to `out`. `input` and `out` must be contiguous.
 void dynamic_scaled_int8_quant(
    torch::Tensor& out,          // [..., hidden_size]
    const torch::Tensor& input,  // [..., hidden_size]
    torch::Tensor& scale         // [..., 1]
 ) {
  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  // Length of one row and the number of rows once the leading dimensions
  // are flattened.
  const int row_len = input.size(-1);
  const int row_count = input.numel() / row_len;
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
        const scalar_t* src = input.data_ptr<scalar_t>();
        int8_t* dst = out.data_ptr<int8_t>();
        float* scale_ptr = scale.data_ptr<float>();
        dynamic_scaled_int8_quant_impl(src, dst, scale_ptr, row_count,
                                       row_len);
      });
 }
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -4,7 +4,12 @@
 #include <torch/library.h>
-void init_cpu_threads_env(const std::string& cpu_ids);
+std::string init_cpu_threads_env(const std::string& cpu_ids);
 void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                    const torch::Tensor& b, const torch::Tensor& a_scales,
                    const torch::Tensor& b_scales,
                    const c10::optional<torch::Tensor>& bias);
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops
@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
+      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
      "    Tensor value_cache, int num_kv_heads, float scale,"
      "    Tensor block_tables, Tensor seq_lens, int block_size,"
      "    int max_seq_len, Tensor? alibi_slopes,"
@@ -84,6 +89,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                 Tensor! key, int head_size,"
      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
  // Quantization
 #ifdef __AVX512F__
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
      "()");
  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
  // Compute int8 quantized tensor and scaling factor
  ops.def(
      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
      "()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
  // quantization.
  ops.def(
      "cutlass_scaled_mm(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
 #endif
 }
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
@@ -95,8 +122,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Copy the cache blocks from src to dst.
  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
-      "block_mapping) -> ()");
+      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
  // Reshape the key and value tensors and cache them.
@@ -111,7 +138,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
  // CPU utils
-  utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env);
+  utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
 }
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -5,7 +5,7 @@
 #include "cpu_types.hpp"
-void init_cpu_threads_env(const std::string& cpu_ids) {
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
  bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
  TORCH_CHECK(omp_cpu_mask->size > 0);
  std::vector<int> omp_cpu_ids;
@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) {
  torch::set_num_threads((int)omp_cpu_ids.size());
  TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
  TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
  std::vector<std::pair<int, int>> thread_core_mapping;
  thread_core_mapping.reserve(omp_cpu_ids.size());
  omp_lock_t writelock;
  omp_init_lock(&writelock);
 #pragma omp parallel for schedule(static, 1)
  for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
-    cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size);
+    cpu_set_t mask;
-    size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size);
+    CPU_ZERO(&mask);
-    CPU_ZERO_S(size, mask);
+    CPU_SET(omp_cpu_ids[i], &mask);
-    CPU_SET_S(omp_cpu_ids[i], size, mask);
+    int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
-    sched_setaffinity(0, sizeof(cpu_set_t), mask);
+    if (ret == -1) {
-    CPU_FREE(mask);
+      TORCH_CHECK(false,
                  "sched_setaffinity failed. errno: " + std::to_string(errno));
    }
    omp_set_lock(&writelock);
    thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
    omp_unset_lock(&writelock);
  }
  omp_destroy_lock(&writelock);
  numa_free_nodemask(omp_cpu_mask);
  std::stringstream ss;
  ss << "OMP threads binding of Process " << getpid() << ":\n";
  std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
            [](auto&& a, auto&& b) { return a.second < b.second; });
  for (auto&& item : thread_core_mapping) {
    ss << "\t"
       << "OMP tid: " << item.first << ", core " << item.second << "\n";
  }
  return ss.str();
 }
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -1737,4 +1737,4 @@ torch::Tensor marlin_gemm_moe(
      moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
      thread_n, sms, max_par, replicate_input, apply_weights);
  return c;
-}
+}
--- a/csrc/moe/marlin_moe_ops.h
+++ b/csrc/moe/marlin_moe_ops.h
@@ -9,4 +9,4 @@ torch::Tensor marlin_gemm_moe(
    const torch::Tensor& g_idx, const torch::Tensor& perm,
    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
-    bool replicate_input, bool apply_weights);
+    bool replicate_input, bool apply_weights);
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
      "bool replicate_input, bool apply_weights) -> Tensor");
  m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
 #endif
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -54,10 +54,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);
 void gelu_quick(torch::Tensor& out, torch::Tensor& input);
-void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
+void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
-                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
+                            int64_t block_size, torch::Tensor& input_tokens,
-                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
+                            torch::Tensor& sampled_token_ids,
-                  torch::Tensor& slot_mapping, torch::Tensor& block_tables);
+                            torch::Tensor& input_positions,
                            torch::Tensor& seq_lens,
                            torch::Tensor& slot_mapping,
                            torch::Tensor& block_tables);
 void advance_step_flashinfer(
    int64_t num_seqs, int64_t num_queries, int64_t block_size,
    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
    torch::Tensor& input_positions, torch::Tensor& seq_lens,
    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
@@ -123,9 +134,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);
 torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
                                      torch::Tensor& perm, c10::SymInt size_k,
                                      c10::SymInt size_n, int64_t num_bits);
 torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
                                int64_t size_n, int64_t num_bits);
 torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
                                     c10::SymInt size_k, c10::SymInt size_n,
                                     int64_t num_bits);
 torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
                              int64_t n);
@@ -170,9 +189,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
 void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                               torch::Tensor& scales);
 void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                     torch::Tensor lookup_table);
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@@ -12,13 +12,11 @@ namespace prepare_inputs {
 //
 template <int const num_threads>
-__global__ void advance_step_kernel(int num_seqs, int num_queries,
+__global__ void advance_step_flashattn_kernel(
-                                    int block_size, long* input_tokens_ptr,
+    int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
-                                    long const* sampled_token_ids_ptr,
+    long const* sampled_token_ids_ptr, long* input_positions_ptr,
-                                    long* input_positions_ptr,
+    int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
-                                    int* seq_lens_ptr, long* slot_mapping_ptr,
+    int64_t const block_tables_stride) {
                                    int const* block_tables_ptr,
                                    int64_t const block_tables_stride) {
  int num_query_blocks = div_ceil(num_queries, num_threads);
  if (blockIdx.x >= num_query_blocks) {
@@ -79,16 +77,91 @@ inline void verify_tensor(std::string const& name, torch::Tensor& t,
  }
 }
-void advance_step(int num_seqs, int num_queries, int block_size,
+__global__ void advance_step_flashinfer_kernel(
-                  torch::Tensor& input_tokens,       // type: long
+    int num_threads, int num_seqs, int num_queries, int block_size,
-                  torch::Tensor& sampled_token_ids,  // type: long
+    long* input_tokens_ptr, long const* sampled_token_ids_ptr,
-                  torch::Tensor& input_positions,    // type: long
+    long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
-                  torch::Tensor& seq_lens,           // type: int
+    int const* block_tables_ptr, int64_t const block_tables_stride,
-                  torch::Tensor& slot_mapping,       // type: long
+    int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
-                  torch::Tensor& block_tables) {     // type: int
+  int num_query_blocks = div_ceil(num_queries, num_threads);
  // Per-query update for one decode step: each thread owns at most one query
  // (cur_query_id). Blocks at or beyond num_query_blocks and threads whose id
  // is >= num_queries write nothing. num_seqs is not read in this kernel.
  // NOTE(review): div_ceil is defined outside this view; presumably ceiling
  // division -- confirm.
  if (blockIdx.x < num_query_blocks) {
    int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
    if (cur_query_id < num_queries) {
      // Update input_tokens
      input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
      // The sequence grows by one token; the new token's position is the old
      // sequence length (next_seq_len - 1).
      int seq_len = seq_lens_ptr[cur_query_id];
      int next_seq_len = seq_len + 1;
      int next_input_pos = next_seq_len - 1;
      // Update seq_lens
      seq_lens_ptr[cur_query_id] = next_seq_len;
      // Update input_positions
      input_positions_ptr[cur_query_id] = next_input_pos;
      // Row of block_tables belonging to this query (rows are
      // block_tables_stride ints apart).
      int const* seq_block_tables_ptr =
          block_tables_ptr + block_tables_stride * cur_query_id;
      // Split the new position into (block within the row, offset in block).
      int block_index = next_input_pos / block_size;
      int block_offset = next_input_pos % block_size;
      // Update paged_kv_last_page_len
      // (offset of the new token within its block, plus one)
      paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1;
      int slot_num =
          seq_block_tables_ptr[block_index] * block_size + block_offset;
      // Update slot_mapping
      slot_mapping_ptr[cur_query_id] = slot_num;
      // Record div_ceil(next_seq_len, block_size) for this query; the indptr
      // and indices kernels in this file read block_table_bound afterwards.
      block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size);
    }
  }
 }
 // For every query q < num_queries, stores in paged_kv_indptr[q + 1] the sum of
 // block_table_bound[0..q] (inclusive). Entry 0 of paged_kv_indptr is not
 // written here, and num_seqs is not read.
 __global__ void advance_step_flashinfer_indptr_kernel(
    int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
    int* block_table_bound_ptr) {
  int const query_idx = blockIdx.x * num_threads + threadIdx.x;
  // Threads that do not map to a query have nothing to do.
  if (query_idx >= num_queries) {
    return;
  }
  // Update paged_kv_indptr
  // Each thread re-accumulates the bounds of all queries up to its own.
  int running_total = 0;
  int pos = 0;
  while (pos <= query_idx) {
    running_total += block_table_bound_ptr[pos];
    ++pos;
  }
  paged_kv_indptr_ptr[query_idx + 1] = running_total;
 }
 // Maps each thread to one (row, col) cell of block_tables, where
 // row = idx / block_tables_stride and col = idx % block_tables_stride.
 //  - row < num_queries and col < block_table_bound[row]: copy the cell into
 //    paged_kv_indices at offset paged_kv_indptr[row] + col.
 //  - num_queries < row <= num_seqs: overwrite paged_kv_indptr[row] with
 //    paged_kv_indptr[num_queries].
 __global__ void advance_step_flashinfer_indices_kernel(
    int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr,
    int64_t const block_tables_stride, int* paged_kv_indices_ptr,
    int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
  int const flat_id = blockIdx.x * num_threads + threadIdx.x;
  int const seq_row = flat_id / block_tables_stride;
  int const blk_col = flat_id % block_tables_stride;
  if (seq_row < num_queries) {
    // Only the first block_table_bound[row] entries of a row are copied.
    if (blk_col < block_table_bound_ptr[seq_row]) {
      paged_kv_indices_ptr[paged_kv_indptr_ptr[seq_row] + blk_col] =
          block_tables_ptr[seq_row * block_tables_stride + blk_col];
    }
  } else if (seq_row > num_queries && seq_row <= num_seqs) {
    // if cudagraph, fill padded seqs with the last valid seq's indptr
    paged_kv_indptr_ptr[seq_row] = paged_kv_indptr_ptr[num_queries];
  }
 }
 void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
                            torch::Tensor& input_tokens,       // type: long
                            torch::Tensor& sampled_token_ids,  // type: long
                            torch::Tensor& input_positions,    // type: long
                            torch::Tensor& seq_lens,           // type: int
                            torch::Tensor& slot_mapping,       // type: long
                            torch::Tensor& block_tables) {     // type: int
  if (logging) {
-    printf("advance_step:\n");
+    printf("advance_step_flashattn:\n");
    printf("  num_seqs = %d\n", num_seqs);
    printf("  num_queries = %d\n", num_queries);
    printf("  block_size = %d\n", block_size);
@@ -108,24 +181,126 @@ void advance_step(int num_seqs, int num_queries, int block_size,
  int blocks;
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
-  advance_step_kernel<max_threads><<<blocks, max_threads, 0, stream>>>(
+  advance_step_flashattn_kernel<max_threads>
-      num_seqs, num_queries, block_size,
+      <<<blocks, max_threads, 0, stream>>>(
          num_seqs, num_queries, block_size,
          reinterpret_cast<long*>(input_tokens.data_ptr()),
          reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
          reinterpret_cast<long*>(input_positions.data_ptr()),
          reinterpret_cast<int*>(seq_lens.data_ptr()),
          reinterpret_cast<long*>(slot_mapping.data_ptr()),
          reinterpret_cast<int const*>(block_tables.data_ptr()),
          block_tables.stride(0));
 }
 // Host-side launcher for the FlashInfer variant of advance_step.
 //
 // Verifies the tensors via verify_tensor, then launches three kernels on the
 // current CUDA stream of sampled_token_ids' device, in this order:
 //   1. advance_step_flashinfer_kernel          (per-query state update)
 //   2. advance_step_flashinfer_indptr_kernel   (paged_kv_indptr)
 //   3. advance_step_flashinfer_indices_kernel  (paged_kv_indices)
 // Each launch uses <multiprocessor count> blocks of <max threads per block>
 // threads. Raises through TORCH_CHECK when that many threads cannot cover
 // num_queries rows of block_tables.stride(0) entries each.
 void advance_step_flashinfer(
    int num_seqs, int num_queries, int block_size,
    torch::Tensor& input_tokens,            // type: long
    torch::Tensor& sampled_token_ids,       // type: long
    torch::Tensor& input_positions,         // type: long
    torch::Tensor& seq_lens,                // type: int
    torch::Tensor& slot_mapping,            // type: long
    torch::Tensor& block_tables,            // type: int
    torch::Tensor& paged_kv_indices,        // type: int
    torch::Tensor& paged_kv_indptr,         // type: int
    torch::Tensor& paged_kv_last_page_len,  // type: int
    torch::Tensor& block_table_bound) {     // type: int
  if (logging) {
    printf("advance_step_flashinfer:\n");
    printf("  num_seqs = %d\n", num_seqs);
    printf("  num_queries = %d\n", num_queries);
    printf("  block_size = %d\n", block_size);
    // stride() returns a 64-bit integer; cast it so the argument matches the
    // format specifier (passing it to %d is a type mismatch).
    printf("  block_tables.stride(0) = %lld\n",
           static_cast<long long>(block_tables.stride(0)));
  }
  // Verify all tensors
  verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
  // verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
  //               at::kLong);
  verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
  verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
  verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
  verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
  verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
  verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
  verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
                at::kInt);
  verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
  int dev = sampled_token_ids.get_device();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
  int blocks;
  int threads;
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
  if (logging) {
    printf("launching kernel with %d blocks\n", blocks);
  }
  // TODO(will): support arbitrary block_tables stride
  // The indices kernel assigns one thread per block_tables cell, so the launch
  // must supply at least num_queries * block_tables.stride(0) threads.
  TORCH_CHECK((blocks * threads) / block_tables.stride(0) >= num_queries,
              "multi-step: not enough threads to map block_table to "
              "FlashInfer's paged_kv_indices on GPU. Try reducing the number "
              "of seqs,",
              " increasing the block size or take smaller steps.",
              " num_queries = ", num_queries,
              " block_tables.stride(0) = ", block_tables.stride(0),
              " blocks = ", blocks, " max_threads = ", threads);
  advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
      threads, num_seqs, num_queries, block_size,
      reinterpret_cast<long*>(input_tokens.data_ptr()),
      reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
      reinterpret_cast<long*>(input_positions.data_ptr()),
      reinterpret_cast<int*>(seq_lens.data_ptr()),
      reinterpret_cast<long*>(slot_mapping.data_ptr()),
      reinterpret_cast<int const*>(block_tables.data_ptr()),
-      block_tables.stride(0));
+      block_tables.stride(0),
      reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
      reinterpret_cast<int*>(block_table_bound.data_ptr()));
  // Reads the block_table_bound values written by the kernel above.
  advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
      threads, num_seqs, num_queries,
      reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
      reinterpret_cast<int*>(block_table_bound.data_ptr()));
  // Reads the paged_kv_indptr values written by the kernel above.
  advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
      threads, num_seqs, num_queries,
      reinterpret_cast<int const*>(block_tables.data_ptr()),
      block_tables.stride(0),
      reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
      reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
      reinterpret_cast<int*>(block_table_bound.data_ptr()));
 }
 }  // namespace prepare_inputs
-void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
+void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
-                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
+                            int64_t block_size, torch::Tensor& input_tokens,
-                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
+                            torch::Tensor& sampled_token_ids,
-                  torch::Tensor& slot_mapping, torch::Tensor& block_tables) {
+                            torch::Tensor& input_positions,
-  prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens,
+                            torch::Tensor& seq_lens,
-                               sampled_token_ids, input_positions, seq_lens,
+                            torch::Tensor& slot_mapping,
-                               slot_mapping, block_tables);
+                            torch::Tensor& block_tables) {
  // Global-namespace entry point: forwards every argument unchanged to the
  // implementation in namespace prepare_inputs. That implementation declares
  // num_seqs / num_queries / block_size as int, so the int64_t values are
  // implicitly narrowed at this call.
  prepare_inputs::advance_step_flashattn(
      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
      input_positions, seq_lens, slot_mapping, block_tables);
 }
 // Global-namespace entry point for the FlashInfer advance step. It forwards
 // all arguments, in order, to prepare_inputs::advance_step_flashinfer, whose
 // first three parameters are declared as int (the int64_t values are
 // implicitly narrowed at the call).
 void advance_step_flashinfer(
    int64_t num_seqs, int64_t num_queries, int64_t block_size,
    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
    torch::Tensor& input_positions, torch::Tensor& seq_lens,
    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
  return prepare_inputs::advance_step_flashinfer(
      num_seqs,
      num_queries,
      block_size,
      input_tokens,
      sampled_token_ids,
      input_positions,
      seq_lens,
      slot_mapping,
      block_tables,
      paged_kv_indices,
      paged_kv_indptr,
      paged_kv_last_page_len,
      block_table_bound);
 }
--- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
 }
 #endif
 // Shape-only variant of awq_marlin_repack: performs no repacking and returns
 // an uninitialized tensor of shape
 //   {size_k / marlin::tile_size,
 //    size_n * marlin::tile_size / (32 / num_bits)}
 // with the dtype and device of b_q_weight.
 torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
                                     c10::SymInt size_k, c10::SymInt size_n,
                                     int64_t num_bits) {
  // How many num_bits-wide values make up 32 bits.
  int const pack_factor = 32 / num_bits;
  auto const out_rows = size_k / marlin::tile_size;
  auto const out_cols = size_n * marlin::tile_size / pack_factor;
  auto const out_options = torch::TensorOptions()
                               .dtype(b_q_weight.dtype())
                               .device(b_q_weight.device());
  return torch::empty_symint({out_rows, out_cols}, out_options);
 }
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
 }
 #endif
 // Shape-only variant of gptq_marlin_repack: performs no repacking and returns
 // an uninitialized tensor of shape
 //   {size_k / marlin::tile_size,
 //    size_n * marlin::tile_size / (32 / num_bits)}
 // with the dtype and device of b_q_weight. The perm argument is not read.
 torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
                                      torch::Tensor& perm, c10::SymInt size_k,
                                      c10::SymInt size_n, int64_t num_bits) {
  // How many num_bits-wide values make up 32 bits.
  int const pack_factor = 32 / num_bits;
  auto const out_rows = size_k / marlin::tile_size;
  auto const out_cols = size_n * marlin::tile_size / pack_factor;
  auto const out_options = torch::TensorOptions()
                               .dtype(b_q_weight.dtype())
                               .device(b_q_weight.device());
  return torch::empty_symint({out_rows, out_cols}, out_options);
 }
--- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu
+++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
@@ -1,216 +0,0 @@
 #include <torch/all.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 // half-tensor
 #include <c10/cuda/CUDAStream.h>
 #include <ATen/cuda/CUDATensorMethods.cuh>
 #include <c10/cuda/CUDAGuard.h>
 #define BLOCKWIDTH 128
 #define BLOCKHEIGHT4 16
 namespace vllm {
 namespace squeezellm {
 // Returns the bits of a signed int as an unsigned int (value is taken modulo
 // 2^N, which leaves the two's-complement bit pattern unchanged).
 __device__ inline unsigned int as_unsigned(int i) {
  return static_cast<unsigned int>(i);
 }
 // 4-bit matvec kernel (LUT-based)
 //
 // Each thread owns one output column `col` and, per batch entry, walks
 // BLOCKWIDTH / 8 consecutive 32-bit words of `mat` down that column. Every
 // word carries eight 4-bit indices; each index selects one of the 16
 // per-column entries of `lookup_table`, which is multiplied with the matching
 // element of `vec` and accumulated. The partial result is atomically added
 // into `mul`.
 //
 // The host launcher in this file starts the kernel with BLOCKWIDTH threads
 // per block; the shared-memory indexing below relies on that.
 // On ROCm, `mul` is a float2 buffer and __half2 fields are handled as raw
 // ushort bit patterns (__half_as_ushort / __ushort_as_half).
 // NOTE(review): `col` is never checked against `width`, and `height` is not
 // read; looks like width is expected to be a multiple of BLOCKWIDTH --
 // confirm against callers.
 __global__ void NUQ4MatMulKernel(
 #ifndef USE_ROCM
    const half2* __restrict__ vec,
 #else
    const __half2* __restrict__ vec,
 #endif
    const int* __restrict__ mat,
 #ifndef USE_ROCM
    half2* __restrict__ mul,
 #else
    float2* __restrict__ mul,
 #endif
    const __half* __restrict__ lookup_table, int height, int width, int batch,
    int vec_height) {
  // Number of half2 pairs in one BLOCKWIDTH-wide slice of vec.
  const int blockwidth2 = BLOCKWIDTH / 2;
  // First row of mat handled by this block, and this thread's column.
  int row = BLOCKHEIGHT4 * blockIdx.x;
  int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;
  // Shared staging buffer for the slice of vec used by this block.
 #ifndef USE_ROCM
  __shared__ half2 blockvec[blockwidth2];
 #else
  __shared__ __half2 blockvec[blockwidth2];
 #endif
  // deq2[val][t]: lookup-table entry `val` for the column owned by thread t.
  __shared__ __half deq2[16][BLOCKWIDTH];
  int off = threadIdx.x;
  // lookup_table is read as 16 consecutive entries per column.
  int column_offset = col * 16;
  for (int val = 0; val < 16; val += 1) {
    int lut_index = column_offset + val;
    deq2[val][off] = lookup_table[lut_index];
  }
  __half res;
 #ifndef USE_ROCM
  half2 res2;
  half2 tmp2;
 #else
  __half2 res2;
  __half2 tmp2;
 #endif
  int i;
  int k;
  unsigned int tmp1;
  unsigned int lut_index1, lut_index2;
  for (int b = 0; b < batch; ++b) {
    // Restart at the first mat word of this thread's column for every batch.
    i = width * row + col;
    res = __int2half_rd(0);
    k = 0;
    // Barrier before blockvec is overwritten (presumably so that no thread is
    // still reading the previous batch's values), then the first blockwidth2
    // threads load this batch's slice of vec, then a second barrier makes the
    // slice visible to all threads.
    __syncthreads();
    if (threadIdx.x < blockwidth2)
      blockvec[threadIdx.x] =
          vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 +
              threadIdx.x];
    __syncthreads();
    // Each iteration consumes one 32-bit word of mat (eight 4-bit indices)
    // and four half2 entries of blockvec.
    while (k < blockwidth2) {
      tmp1 = as_unsigned(mat[i]);
 #ifndef USE_ROCM
      res2 = {};
      tmp2 = {};
 #else
      res2.x = __half_as_ushort(__float2half(0));
      res2.y = __half_as_ushort(__float2half(0));
      tmp2.x = __half_as_ushort(__float2half(0));
      tmp2.y = __half_as_ushort(__float2half(0));
 #endif
      // Indices in bits 0-3 and 4-7, paired with blockvec[k + 0].
      lut_index1 = tmp1 & 0xF;
      lut_index2 = (tmp1 >> 4) & 0xF;
 #ifndef USE_ROCM
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
 #else
      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
 #endif
      res2 = __hfma2(tmp2, blockvec[k + 0], res2);
      // Indices in bits 8-11 and 12-15, paired with blockvec[k + 1].
      lut_index1 = (tmp1 >> 8) & 0xF;
      lut_index2 = (tmp1 >> 12) & 0xF;
 #ifndef USE_ROCM
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
 #else
      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
 #endif
      res2 = __hfma2(tmp2, blockvec[k + 1], res2);
      // Indices in bits 16-19 and 20-23, paired with blockvec[k + 2].
      lut_index1 = (tmp1 >> 16) & 0xF;
      lut_index2 = (tmp1 >> 20) & 0xF;
 #ifndef USE_ROCM
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
 #else
      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
 #endif
      res2 = __hfma2(tmp2, blockvec[k + 2], res2);
      // Indices in bits 24-27 and 28-31, paired with blockvec[k + 3].
      lut_index1 = (tmp1 >> 24) & 0xF;
      lut_index2 = (tmp1 >> 28) & 0xF;
 #ifndef USE_ROCM
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
 #else
      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
 #endif
      res2 = __hfma2(tmp2, blockvec[k + 3], res2);
      // Fold both lanes of this word's partial sum into the running scalar.
 #ifndef USE_ROCM
      res = __hadd(__hadd(res2.x, res2.y), res);
 #else
      res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)),
                   res);
 #endif
      // Advance to the next row of mat (same column) and the next four half2
      // entries of blockvec.
      i += width;
      k += 4;
    }
    // col%2 -> only set one of the two values
    // (mul is addressed in pairs of columns; the other lane contributes zero)
 #ifndef USE_ROCM
    half2 res3 = {};
    if (col % 2 == 0) {
      res3.x = res;
    } else {
      res3.y = res;
    }
 #else
    __half2 res3;
    res3.x = __half_as_ushort(__float2half(0));
    res3.y = __half_as_ushort(__float2half(0));
    if (col % 2 == 0) {
      res3.x = __half_as_ushort(res);
    } else {
      res3.y = __half_as_ushort(res);
    }
 #endif
    // Accumulate atomically into the output pair for (batch b, column col).
    // On ROCm the two lanes are added as separate floats.
 #ifndef USE_ROCM
    atomicAdd(&mul[b * width / 2 + col / 2], res3);
 #else
    int tmp_addr = b * width / 2 + col / 2;
    atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x)));
    atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y)));
 #endif
  }
 }
 }  // namespace squeezellm
 }  // namespace vllm
 // Host launcher for the 4-bit LUT-based matvec kernel (NUQ4MatMulKernel).
 // Sizes are taken from mat (height x width) and vec (batch x vec_height).
 // The grid is ceil(height / BLOCKHEIGHT4) x ceil(width / BLOCKWIDTH) blocks of
 // BLOCKWIDTH threads, launched on the current CUDA stream with vec's device
 // active. The kernel accumulates into mul; nothing is returned.
 void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                     torch::Tensor lookup_table) {
  int const height = mat.size(0);
  int const width = mat.size(1);
  int const batch = vec.size(0);
  int const vec_height = vec.size(1);

  // Round both grid dimensions up so partial tiles are still covered.
  int const row_tiles = (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4;
  int const col_tiles = (width + BLOCKWIDTH - 1) / BLOCKWIDTH;
  dim3 blocks(row_tiles, col_tiles);
  dim3 threads(BLOCKWIDTH);

  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Raw device pointers; on ROCm the output buffer is float2 instead of half2.
 #ifndef USE_ROCM
  auto* vec_ptr = (half2*)vec.data_ptr<at::Half>();
 #else
  auto* vec_ptr = (__half2*)vec.data_ptr<at::Half>();
 #endif
  auto* mat_ptr = mat.data_ptr<int>();
 #ifndef USE_ROCM
  auto* mul_ptr = (half2*)mul.data_ptr<at::Half>();
 #else
  auto* mul_ptr = (float2*)mul.data_ptr<float>();
 #endif
  auto* lut_ptr = (__half*)lookup_table.data_ptr<at::Half>();

  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
      vec_ptr, mat_ptr, mul_ptr, lut_ptr, height, width, batch, vec_height);
 }
 #undef BLOCKWIDTH
 #undef BLOCKHEIGHT4
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
+      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
      "    Tensor value_cache, int num_kv_heads, float scale,"
      "    Tensor block_tables, Tensor seq_lens, int block_size,"
      "    int max_seq_len, Tensor? alibi_slopes,"
@@ -73,8 +73,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
  // prepare_inputs advance_step
-  ops.def("advance_step", &advance_step);
+  ops.def(
-  ops.impl("advance_step", torch::kCUDA, &advance_step);
+      "advance_step_flashattn(int num_seqs, int num_queries, int block_size, "
      "Tensor! input_tokens, Tensor sampled_token_ids, "
      "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
      "Tensor block_tables) -> ()");
  ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn);
  ops.def(
      "advance_step_flashinfer("
      "    int num_seqs, int num_queries, int block_size,"
      "    Tensor! input_tokens, Tensor sampled_token_ids,"
      "    Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping,"
      "    Tensor block_tables, Tensor! paged_kv_indices,"
      "    Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len,"
      "    Tensor! block_table_bounds"
      ") -> ()");
  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
@@ -110,27 +125,56 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Quantization ops
 #ifndef USE_ROCM
  // Quantized GEMM for AQLM.
-  ops.def("aqlm_gemm", &aqlm_gemm);
+  ops.def(
      "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
      "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
      "-> Tensor");
  ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
  // Decompression method for AQLM.
-  ops.def("aqlm_dequant", &aqlm_dequant);
+  ops.def(
      "aqlm_dequant(Tensor codes, Tensor codebooks, "
      "int[] codebook_partition_sizes) -> Tensor");
  ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
  // Quantized GEMM for AWQ.
-  ops.def("awq_gemm", &awq_gemm);
+  ops.def(
      "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
      "Tensor _zeros, int split_k_iters) -> Tensor");
  ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
  // Dequantization for AWQ.
-  ops.def("awq_dequantize", &awq_dequantize);
+  ops.def(
      "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
      "Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
  ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
  // Note about marlin kernel 'workspace' arguments:
  // Technically these should be mutable since they are modified by the kernel.
  // But since they are set back to zero once the kernel is finished we can
  // hand wave and say that they have no net effect.
  //
  // The reason to mark 'workspace' as immutable is so that they don't interfere
  // with using ScalarType arguments in the ops. If they are marked as mutable,
  // pytorch throws an assert in
  // 'torch._higher_order_ops._register_effectful_op' that prevents these
  // kernels from being torch.compile'd.
  // See the following document for more info on custom types and ops that use
  // custom types:
  // https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
  // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
-  ops.def("marlin_gemm", &marlin_gemm);
+  ops.def(
      "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
      "Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
  ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
  // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
+  ops.def(
      "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
      "Tensor b_scales, Tensor workspace, "
      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
      "int size_m, int size_n, int size_k) -> Tensor");
  ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
  // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
@@ -149,35 +193,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
  // gptq_marlin Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm);
+  ops.def(
      "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
      "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
      "int size_m, int size_n, int size_k, bool is_k_full, "
      "bool has_zp, bool use_fp32_reduce) -> Tensor");
  ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
  // gptq_marlin repack from GPTQ.
-  ops.def("gptq_marlin_repack", &gptq_marlin_repack);
+  ops.def(
      "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
      "SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
  ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
  ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
  // awq_marlin repack from AWQ.
-  ops.def("awq_marlin_repack", &awq_marlin_repack);
+  ops.def(
      "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
      "SymInt size_n, int num_bits) -> Tensor");
  ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
  ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);
  // Dequantization for GGML.
-  ops.def("ggml_dequantize", &ggml_dequantize);
+  ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
  ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
  // mmvq kernel for GGML.
-  ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8);
+  ops.def(
      "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
      "-> Tensor");
  ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
  // mmq kernel for GGML.
-  ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8);
+  ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
  ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
-  ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
+  ops.def(
      "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
      "Tensor! workspace, int num_bits, int size_m, int size_n, "
      "int size_k) -> Tensor");
  ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
  // marlin_qqq_gemm for QQQ.
-  ops.def("marlin_qqq_gemm", &marlin_qqq_gemm);
+  ops.def(
      "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
      "Tensor s_tok, Tensor s_ch, Tensor s_group, "
      "Tensor! workspace, int size_m, int size_n, "
      "int size_k) -> Tensor");
  ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
@@ -199,16 +263,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Check if cutlass scaled_mm is supported for CUDA devices of the given
  // capability
-  ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
+  ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
-  ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
+  ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
-           &cutlass_scaled_mm_supports_fp8);
+
  // Mamba selective scan kernel
  ops.def(
      "selective_scan_fwd(Tensor! u, Tensor! delta,"
      "Tensor! A, Tensor! B, Tensor! C,"
      "Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
      "bool delta_softplus,"
-      "Tensor? index_, Tensor? x) -> Tensor[]");
+      "Tensor? index_, Tensor(a! -> *)? x) -> Tensor(a)[]");
  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
  ops.def(
@@ -230,19 +294,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif
  // Quantized GEMM for GPTQ.
-  ops.def("gptq_gemm", &gptq_gemm);
+  // Note: even though the C++ inferred schema is correct for this op, it seems
  // to prevent the meta function registry.
  ops.def(
      "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
      "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
      "-> Tensor");
  ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
  // Post processing for GPTQ.
  ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
  ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
  // Quantized GEMM for SqueezeLLM.
  ops.def(
      "squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
      "lookup_table) -> ()");
  ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
  // Compute FP8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
@@ -256,8 +319,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
  ops.def(
-      "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! "
+      "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
-      "scale, Tensor? scale_ub) -> "
+      "Tensor! scale, Tensor? scale_ub) -> "
      "()");
  ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
           &dynamic_per_token_scaled_fp8_quant);
@@ -294,8 +357,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Copy the cache blocks from src to dst.
  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
-      "block_mapping) -> ()");
+      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
  // Reshape the key and value tensors and cache them.
@@ -320,8 +383,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Convert the key and value cache to fp8 data type.
  cache_ops.def(
-      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str "
+      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
-      "kv_cache_dtype) -> ()");
+      "str kv_cache_dtype) -> ()");
  cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
 }
@@ -329,24 +392,28 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
  // Cuda utils
  // Gets the specified device attribute.
-  cuda_utils.def("get_device_attribute", &get_device_attribute);
+  cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
-  cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute);
+  cuda_utils.impl("get_device_attribute", &get_device_attribute);
  // Gets the maximum shared memory per block device attribute.
-  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
+  cuda_utils.def(
-                 &get_max_shared_memory_per_block_device_attribute);
+      "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
  cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
                  torch::kCUDA,
                  &get_max_shared_memory_per_block_device_attribute);
 }
 #ifndef USE_ROCM
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  // Custom all-reduce kernels
-  custom_ar.def("init_custom_ar", &init_custom_ar);
+  custom_ar.def(
      "init_custom_ar(Tensor meta, Tensor rank_data, "
      "str[] handles, int[] offsets, int rank, "
      "bool full_nvlink) -> int");
  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
-  custom_ar.def("should_custom_ar", &should_custom_ar);
+  custom_ar.def(
      "should_custom_ar(Tensor inp, int max_size, int world_size, "
      "bool full_nvlink) -> bool");
  custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
  custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
@@ -358,21 +425,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
  custom_ar.def("dispose", &dispose);
  custom_ar.impl("dispose", torch::kCPU, &dispose);
  custom_ar.def("meta_size", &meta_size);
  custom_ar.impl("meta_size", torch::kCPU, &meta_size);
-  custom_ar.def("register_buffer", &register_buffer);
+  custom_ar.def(
      "register_buffer(int fa, Tensor t, str[] handles, "
      "int[] offsets) -> ()");
  custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
  custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
                 &get_graph_buffer_ipc_meta);
  custom_ar.def("register_graph_buffers", &register_graph_buffers);
  custom_ar.impl("register_graph_buffers", torch::kCPU,
                 &register_graph_buffers);
 }
 #endif
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -11,6 +11,5 @@ pydantic >= 2.8
 torch
 py-cpuinfo
 transformers
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 mistral_common >= 1.3.4
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
--- a/docs/source/community/meetups.rst
+++ b/docs/source/community/meetups.rst
@@ -5,6 +5,7 @@ vLLM Meetups
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 - `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
 - `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
 - `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
 - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -99,6 +99,7 @@ autodoc_mock_imports = [
    "aiohttp",
    "compressed_tensors",
    "cpuinfo",
    "cv2",
    "torch",
    "transformers",
    "psutil",
--- a/docs/source/dev/profiling/profiling_index.rst
+++ b/docs/source/dev/profiling/profiling_index.rst
@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.
 .. tip::
   Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-   
+
-Example commands:
+.. tip::
   When the profiler is stopped, it flushes all the profile trace files to the directory. This takes time: for example, for about 100 requests' worth of data for a Llama 70B, it takes about 10 minutes to flush on an H100.
   Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
   ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
 Example commands and usage:
 ===========================
 Offline Inference:
 ------------------
 Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
 OpenAI Server:
 --------------
 .. code-block:: bash
-    VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B 
+    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B 
 benchmark_serving.py:
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -21,7 +21,7 @@ If you have already taken care of the above issues, but the vLLM instance still
 With more logging, hopefully you can find the root cause of the issue.
-If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
+If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
 Here are some common issues that can cause hangs:
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -24,7 +24,9 @@ Offline Batched Inference
 We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.
-Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process.
+Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM.
 The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine.
 The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process.
 .. code-block:: python
@@ -42,7 +44,7 @@ Define the list of input prompts and the sampling parameters for generation. The
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.
+Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.
 .. code-block:: python
--- a/docs/source/models/lora.rst
+++ b/docs/source/models/lora.rst
@@ -107,3 +107,55 @@ The following is an example request
            "max_tokens": 7,
            "temperature": 0
        }' | jq
 Dynamically serving LoRA Adapters
 ---------------------------------
 In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading
 LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
 to change models on-the-fly is needed.
 Note: Enabling this feature in production environments is risky, as users may participate in model adapter management.
 To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
 is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
 .. code-block:: bash
    export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
 Loading a LoRA Adapter:
 To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
 details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter.
 Example request to load a LoRA adapter:
 .. code-block:: bash
    curl -X POST http://localhost:8000/v1/load_lora_adapter \
    -H "Content-Type: application/json" \
    -d '{
        "lora_name": "sql_adapter",
        "lora_path": "/path/to/sql-lora-adapter"
    }'
 Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter
 cannot be found or loaded, an appropriate error message will be returned.
 Unloading a LoRA Adapter:
 To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint
 with the name or ID of the adapter to be unloaded.
 Example request to unload a LoRA adapter:
 .. code-block:: bash
    curl -X POST http://localhost:8000/v1/unload_lora_adapter \
    -H "Content-Type: application/json" \
    -d '{
        "lora_name": "sql_adapter"
    }'
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@@ -161,6 +161,46 @@ A variety of speculative models of this type are available on HF hub:
 * `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
 * `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
 Lossless guarantees of Speculative Decoding
 -------------------------------------------
 In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of 
 speculative decoding, breaking down the guarantees into three key areas:
 1. **Theoretical Losslessness**
   - Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might 
   cause slight variations in output distributions, as discussed 
   in `Accelerating Large Language Model Decoding with Speculative Sampling <https://arxiv.org/pdf/2302.01318>`_
 2. **Algorithmic Losslessness**
   - vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include:
    - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target 
      distribution. `View Test Code <https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252>`_
    - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
      without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, 
      provides a lossless guarantee.  Almost all of the tests in `this directory <https://github.com/vllm-project/vllm/tree/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e>`_
      verify this property using `this assertion implementation <https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291>`_
 3. **vLLM Logprob Stability**
   - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the 
   same request across runs. For more details, see the FAQ section 
   titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_.
 **Conclusion**
 While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding 
 can occur due to the following factors:
 - **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution.
 - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially 
  due to non-deterministic behavior in batched operations or numerical instability.
 **Mitigation Strategies**
 For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_.
 Resources for vLLM contributors
 -------------------------------
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -194,12 +194,12 @@ Multimodal Language Models
  * - Architecture
    - Models
-    - Supported Modalities
+    - Modalities
    - Example HuggingFace Models
    - :ref:`LoRA <lora>`
  * - :code:`Blip2ForConditionalGeneration`
    - BLIP-2
-    - Image
+    - Image\ :sup:`E`
    - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
    -
  * - :code:`ChameleonForConditionalGeneration`
@@ -214,44 +214,75 @@ Multimodal Language Models
    - 
  * - :code:`InternVLChatModel`
    - InternVL2
-    - Image
+    - Image\ :sup:`E+`
    - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
    - 
  * - :code:`LlavaForConditionalGeneration`
    - LLaVA-1.5
-    - Image
+    - Image\ :sup:`E+`
    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
    -
  * - :code:`LlavaNextForConditionalGeneration`
    - LLaVA-NeXT
-    - Image
+    - Image\ :sup:`E+`
    - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
    -
  * - :code:`LlavaNextVideoForConditionalGeneration`
    - LLaVA-NeXT-Video
    - Video
    - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
    -
  * - :code:`MiniCPMV`
    - MiniCPM-V
    - Image\ :sup:`+`
    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
    -
  * - :code:`PaliGemmaForConditionalGeneration`
    - PaliGemma
-    - Image
+    - Image\ :sup:`E`
    - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
    - 
  * - :code:`Phi3VForCausalLM`
    - Phi-3-Vision, Phi-3.5-Vision
-    - Image
+    - Image\ :sup:`E+`
    - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct`, etc.
    -
-  * - :code:`MiniCPMV`
+  * - :code:`PixtralForConditionalGeneration`
-    - MiniCPM-V
+    - Pixtral
-    - Image
+    - Image\ :sup:`+`
-    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
+    - :code:`mistralai/Pixtral-12B-2409`
    -
  * - :code:`QWenLMHeadModel`
    - Qwen-VL
    - Image\ :sup:`E+`
    - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
    -
  * - :code:`Qwen2VLForConditionalGeneration`
    - Qwen2-VL (see note)
    - Image\ :sup:`+` / Video\ :sup:`+`
    - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
    -
  * - :code:`UltravoxModel`
    - Ultravox
-    - Audio
+    - Audio\ :sup:`E+`
    - :code:`fixie-ai/ultravox-v0_3`
    -
 | :sup:`E` Pre-computed embeddings can be inputted for this modality.
 | :sup:`+` Multiple items can be inputted per text prompt for this modality.
 .. note::
  For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
  For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
 .. note::
  For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
  This can be installed by running the following command: 
  .. code-block:: bash
    pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
 ----
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -9,26 +9,23 @@ This document shows you how to run and serve these models using vLLM.
 .. important::
    We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
    Currently, the support for vision language models on vLLM has the following limitations:
    * Only single image input is supported per text prompt.
    We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
-Offline Batched Inference
+Offline Inference
-------------------------
+-----------------
-To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine.
+Single-image input
 ^^^^^^^^^^^^^^^^^^
 The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models.
 .. code-block:: python
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-.. important::
+.. note::
    We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
-    the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that
+    the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model.
    internally for each model.
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:
@@ -86,61 +83,117 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI
 A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.
 Multi-image input
 ^^^^^^^^^^^^^^^^^
-Online OpenAI Vision API Compatible Inference
+Multi-image input is only supported for a subset of VLMs, as shown :ref:`here <supported_vlms>`.
----------------------------------------------
+
 To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class.
 .. code-block:: python
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,  # Required to load Phi-3.5-vision
        max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
        limit_mm_per_prompt={"image": 2},  # The maximum number to accept
    )
 Instead of passing in a single image, you can pass in a list of images.
 .. code-block:: python
    # Refer to the HuggingFace repo for the correct format to use
    prompt = "<|user|>\n<image_1>\n<image_2>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
    # Load the images using PIL.Image
    image1 = PIL.Image.open(...)
    image2 = PIL.Image.open(...)
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": [image1, image2]
        },
    })
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
 A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_.
 Online Inference
 ----------------
 OpenAI Vision API
 ^^^^^^^^^^^^^^^^^
 You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
-.. note::
+Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server.
    Currently, vLLM supports only **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be
    added in the future.
 Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with vLLM API server.
 .. important::
    Since OpenAI Vision API is based on `Chat <https://platform.openai.com/docs/api-reference/chat>`_ API, a chat template 
    is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the 
    HuggingFace Llava chat template that you can find in the example folder `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
 .. code-block:: bash
-    vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
+    vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
      --trust-remote-code --limit-mm-per-prompt image=2
 .. important::
-    We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
+    Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
-    the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that
+    a chat template is **required** to launch the API server.
-    internally for each model.
+
    Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it.
    The chat template can be inferred based on the documentation on the model's HuggingFace repo.
    For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
 To consume the server, you can use the OpenAI client like in the example below:
 .. code-block:: python
    from openai import OpenAI
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    # Single-image input inference
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    chat_response = client.chat.completions.create(
-        model="llava-hf/llava-1.5-7b-hf",
+        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
-                {"type": "text", "text": "What's in this image?"},
+                {"type": "text", "text": "What’s in this image?"},
-                {
+                {"type": "image_url", "image_url": {"url": image_url}},
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                    },
                },
            ],
        }],
    )
-    print("Chat response:", chat_response)
+    print("Chat completion output:", chat_response.choices[0].message.content)
    # Multi-image input inference
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
    chat_response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "What are the animals in these images?"},
                {"type": "image_url", "image_url": {"url": image_url_duck}},
                {"type": "image_url", "image_url": {"url": image_url_lion}},
            ],
        }],
    )
    print("Chat completion output:", chat_response.choices[0].message.content)
 A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.
--- a/docs/source/quantization/supported_hardware.rst
+++ b/docs/source/quantization/supported_hardware.rst
@@ -119,17 +119,6 @@ The table below shows the compatibility of various quantization implementations
     - ✗
     - ✗
     - ✗
   * - SqueezeLLM
     - ✅︎
     - ✅︎
     - ✅︎
     - ✅︎
     - ✅︎
     - ✗
     - ✗
     - ✗
     - ✗
     - ✗
 Notes:
 ^^^^^^
--- a/docs/source/serving/faq.rst
+++ b/docs/source/serving/faq.rst
@@ -10,3 +10,22 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
    Q: Which model to use for offline inference embedding?
 A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. In contrast, models such as Llama-3-8b and Mistral-7B-Instruct-v0.3 are generation models rather than embedding models.
 ----------------------------------------
    Q: Can the output of a prompt vary across runs in vLLM?
 A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to
 numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, 
 see the `Numerical Accuracy section <https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations>`_.
 In vLLM, the same requests might be batched differently due to factors such as other concurrent requests,
 changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, 
 can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in 
 different tokens being sampled. Once a different token is sampled, further divergence is likely.
 **Mitigation Strategies**
 - For improved stability and reduced variance, use `float32`. Note that this will require more memory.
 - If using `bfloat16`, switching to `float16` can also help.
 - Using request seeds can aid in achieving more stable generation for temperature > 0, but discrepancies due to precision differences may still occur.
--- a/examples/fp8/README.md
+++ b/examples/fp8/README.md
@@ -62,7 +62,7 @@ This script evaluates the inference throughput of language models using various
 python3 benchmarks/benchmark_throughput.py --help 
 usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
-                               [--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
+                               [--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
                               [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
                               [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
                               [--quantization-param-path KV_CACHE_quantization_param_path]
@@ -76,7 +76,7 @@ optional arguments:
  --output-len OUTPUT_LEN  Output length for each request. Overrides the output length from the dataset.
  --model MODEL
  --tokenizer TOKENIZER
-  --quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None}
+  --quantization {awq,gptq,None}, -q {awq,gptq,None}
  --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
  --n N  Number of generated sequences per prompt.
  --use-beam-search
--- a/examples/fp8/quantizer/README.md
+++ b/examples/fp8/quantizer/README.md
@@ -1,6 +1,6 @@
 ### Quantizer Utilities
-`quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM:
+`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported
-`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py`
+from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py)
 ### Prerequisite
--- a/examples/offline_inference_pixtral.py
+++ b/examples/offline_inference_pixtral.py
@@ -0,0 +1,165 @@
 # ruff: noqa
 import argparse
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
 # This script is an offline demo for running Pixtral.
 #
 # If you want to run a server/client setup, please follow this code:
 #
 # - Server:
 #
 # ```bash
 # vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
 # ```
 #
 # - Client:
 #
 # ```bash
 # curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
 # --header 'Content-Type: application/json' \
 # --header 'Authorization: Bearer token' \
 # --data '{
 #     "model": "mistralai/Pixtral-12B-2409",
 #     "messages": [
 #       {
 #         "role": "user",
 #         "content": [
 #             {"type" : "text", "text": "Describe this image in detail please."},
 #             {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
 #             {"type" : "text", "text": "and this one as well. Answer in French."},
 #             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
 #         ]
 #       }
 #     ]
 #   }'
 # ```
 #
 # Usage:
 #     python offline_inference_pixtral.py simple
 #     python offline_inference_pixtral.py advanced
 def run_simple_demo():
    """Send one text-plus-image chat request to Pixtral and print the reply."""
    pixtral_model = "mistralai/Pixtral-12B-2409"
    params = SamplingParams(max_tokens=8192)
    # On low-VRAM GPUs, pass a smaller max_num_seqs or max_model_len here.
    engine = LLM(model=pixtral_model, tokenizer_mode="mistral")
    # A single user turn: the question, followed by the image it refers to.
    text_part = {
        "type": "text",
        "text": "Describe this image in one sentence.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": "https://picsum.photos/id/237/200/300"},
    }
    conversation = [{"role": "user", "content": [text_part, image_part]}]
    replies = engine.chat(conversation, sampling_params=params)
    print(replies[0].outputs[0].text)
 def run_advanced_demo():
    """Run a multi-turn, multi-image chat against Pixtral and print the reply.

    The engine is created with ``limit_mm_per_prompt`` set to
    ``max_img_per_msg`` images and ``max_model_len`` set to
    ``max_img_per_msg * max_tokens_per_img``.
    """
    pixtral_model = "mistralai/Pixtral-12B-2409"
    max_img_per_msg = 5
    max_tokens_per_img = 4096
    params = SamplingParams(max_tokens=8192, temperature=0.7)
    engine = LLM(
        model=pixtral_model,
        tokenizer_mode="mistral",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
    )

    def image_part(url):
        # Wrap a URL in the image_url content-part structure.
        return {"type": "image_url", "image_url": {"url": url}}

    # First user turn: one text part followed by two images.
    opening_turn = [
        {"type": "text", "text": "Describe the following image."},
        image_part(
            "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
        ),
        image_part("https://picsum.photos/seed/picsum/200/300"),
    ]
    conversation = [
        {"role": "user", "content": opening_turn},
        {"role": "assistant", "content": "The images show nature."},
        {
            "role": "user",
            "content": "More details please and answer only in French!.",
        },
        # Final user turn carries a third image on its own.
        {
            "role": "user",
            "content": [image_part("https://picsum.photos/id/32/512/512")],
        },
    ]
    replies = engine.chat(messages=conversation, sampling_params=params)
    print(replies[0].outputs[0].text)
 def main():
    """Parse the ``mode`` positional argument and run the matching demo."""
    cli = argparse.ArgumentParser(
        description="Run a demo in simple or advanced mode.")
    cli.add_argument(
        "mode",
        choices=["simple", "advanced"],
        help="Specify the demo mode: 'simple' or 'advanced'",
    )
    mode = cli.parse_args().mode
    # Each mode maps to the banner printed first and the demo run after it.
    demos = {
        "simple": ("Running simple demo...", run_simple_demo),
        "advanced": ("Running advanced demo...", run_advanced_demo),
    }
    if mode in demos:
        banner, demo = demos[mode]
        print(banner)
        demo()
 if __name__ == "__main__":
    main()
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -9,12 +9,9 @@ from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser
 # Input image and question
 image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
 question = "What is the content of this image?"
 # LLaVA-1.5
 def run_llava(question):
@@ -30,7 +27,16 @@ def run_llava(question):
 def run_llava_next(question):
    prompt = f"[INST] <image>\n{question} [/INST]"
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
 # LlaVA-NeXT-Video
 # Currently only supports video input
 def run_llava_next_video(question):
    """Return ``(llm, prompt, stop_token_ids)`` for LLaVA-NeXT-Video.

    The prompt wraps ``question`` in a USER/ASSISTANT turn with a ``<video>``
    placeholder; no stop token ids are supplied (``None``).
    """
    text = f"USER: <video>\n{question} ASSISTANT:"
    engine = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
    return engine, text, None
@@ -159,9 +165,41 @@ def run_blip2(question):
    return llm, prompt, stop_token_ids
 # Qwen
 def run_qwen_vl(question):
    """Return ``(llm, prompt, stop_token_ids)`` for Qwen-VL.

    The prompt is the question followed by a single numbered picture
    placeholder; no stop token ids are supplied (``None``).
    """
    engine_kwargs = dict(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_num_seqs=5,
    )
    engine = LLM(**engine_kwargs)
    text = f"{question}Picture 1: <img></img>\n"
    return engine, text, None
 # Qwen2-VL
 def run_qwen2_vl(question):
    """Return ``(llm, prompt, stop_token_ids)`` for Qwen2-VL-7B-Instruct.

    The prompt has a fixed system turn, a user turn holding one image
    placeholder followed by ``question``, and an opened assistant turn.
    No stop token ids are supplied (``None``).
    """
    engine = LLM(model="Qwen/Qwen2-VL-7B-Instruct", max_num_seqs=5)
    segments = [
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>",
        f"{question}<|im_end|>\n",
        "<|im_start|>assistant\n",
    ]
    return engine, "".join(segments), None
 model_example_map = {
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "fuyu": run_fuyu,
    "phi3_v": run_phi3v,
    "paligemma": run_paligemma,
@@ -169,14 +207,54 @@ model_example_map = {
    "minicpmv": run_minicpmv,
    "blip-2": run_blip2,
    "internvl_chat": run_internvl,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
 }
 def get_multi_modal_input(args):
    """Load the sample asset and its question for ``args.modality``.

    Returns a dict of the form::

        {
            "data": image or video,
            "question": question,
        }

    Raises ValueError when the modality is neither "image" nor "video".
    """
    kind = args.modality
    if kind == "image":
        # Input image and question
        picture = ImageAsset("cherry_blossom").pil_image.convert("RGB")
        return dict(
            data=picture,
            question="What is the content of this image?",
        )
    if kind == "video":
        # Input video and question; the frame count comes from the CLI args
        clip = VideoAsset(name="sample_demo_1.mp4",
                          num_frames=args.num_frames)
        return dict(
            data=clip.np_ndarrays,
            question="Why is this video funny?",
        )
    raise ValueError(f"Modality {args.modality} is not supported.")
 def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")
    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    question = mm_input["question"]
    llm, prompt, stop_token_ids = model_example_map[model](question)
    # We set temperature to 0.2 so that outputs can be different
@@ -191,7 +269,7 @@ def main(args):
        inputs = {
            "prompt": prompt,
            "multi_modal_data": {
-                "image": image
+                modality: data
            },
        }
@@ -200,7 +278,7 @@ def main(args):
        inputs = [{
            "prompt": prompt,
            "multi_modal_data": {
-                "image": image
+                modality: data
            },
        } for _ in range(args.num_prompts)]
@@ -223,8 +301,15 @@ if __name__ == "__main__":
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
-                        default=1,
+                        default=4,
                        help='Number of prompts to run.')
-
+    parser.add_argument('--modality',
                        type=str,
                        default="image",
                        help='Modality of the input.')
    parser.add_argument('--num-frames',
                        type=int,
                        default=16,
                        help='Number of frames to extract from the video.')
    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@@ -0,0 +1,243 @@
 """
 This example shows how to use vLLM for running offline inference with
 multi-image input on vision language models, using the chat template defined
 by the model.
 """
 from argparse import Namespace
 from typing import List
 from transformers import AutoProcessor, AutoTokenizer
 from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser
 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
 ]
 def load_qwenvl_chat(question: str, image_urls: List[str]):
    """Build the Qwen-VL-Chat engine and a multi-image prompt.

    Returns a 5-tuple ``(llm, prompt, stop_token_ids, None, chat_template)``.
    The fourth element is always ``None`` here; the last one is the explicit
    chat template string that was used to render ``prompt``.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        max_num_seqs=5,
        # Allow as many images per prompt as URLs were given.
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One "Picture N: <img></img>" line per URL, numbered from 1.
    placeholders = "".join(f"Picture {i}: <img></img>\n"
                           for i, _ in enumerate(image_urls, start=1))
    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501
    # Single user turn: all picture placeholders first, then the question.
    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True,
                                           chat_template=chat_template)
    # Convert the stop token strings to ids with the model's own tokenizer.
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return llm, prompt, stop_token_ids, None, chat_template
 def load_phi3v(question: str, image_urls: List[str]):
    """Build the Phi-3.5-vision engine and a prompt with one tag per image.

    Returns ``(llm, prompt, None, None, None)``: this loader supplies no stop
    token ids, no preloaded image data and no chat template.
    """
    image_count = len(image_urls)
    engine = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": image_count},
    )
    # Tags are numbered from 1: <|image_1|>, <|image_2|>, ...
    tags = [f"<|image_{idx}|>" for idx in range(1, image_count + 1)]
    image_block = "\n".join(tags)
    text = f"<|user|>\n{image_block}\n{question}<|end|>\n<|assistant|>\n"
    return engine, text, None, None, None
 def load_internvl(question: str, image_urls: List[str]):
    """Build the InternVL2-2B engine and a multi-image prompt.

    The prompt is rendered with the chat template of the model's tokenizer.
    Returns ``(llm, prompt, stop_token_ids, None, None)``.
    """
    model_name = "OpenGVLab/InternVL2-2B"
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        max_num_seqs=5,
        max_model_len=4096,
        # Allow as many images per prompt as URLs were given.
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One "Image-N: <image>" entry per URL, numbered from 1. Each entry ends
    # with a newline and entries are also joined with a newline.
    placeholders = "\n".join(f"Image-{i}: <image>\n"
                             for i, _ in enumerate(image_urls, start=1))
    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return llm, prompt, stop_token_ids, None, None
 def load_qwen2_vl(question: str, image_urls: List[str]):
    """Build the Qwen2-VL-7B-Instruct engine, prompt and image inputs.

    Unlike the other loaders in this file, this one returns the image data
    itself as the fourth element:
    ``(llm, prompt, None, image_data, None)``.
    """
    # `qwen_vl_utils` is optional; without it the images are fetched as-is.
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
        print('WARNING: `qwen-vl-utils` not installed, input images will not '
              'be automatically resized. You can enable this functionality by '
              '`pip install qwen-vl-utils`.')
        process_vision_info = None
    model_name = "Qwen/Qwen2-VL-7B-Instruct"
    llm = LLM(
        model=model_name,
        max_num_seqs=5,
        # Larger context when the helper is missing; presumably because
        # images that are not resized need more tokens -- TODO confirm.
        max_model_len=32768 if process_vision_info is None else 4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One image content part per URL, placed before the question text.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [{
        "role": "system",
        "content": "You are a helpful assistant."
    }, {
        "role":
        "user",
        "content": [
            *placeholders,
            {
                "type": "text",
                "text": question
            },
        ],
    }]
    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
    stop_token_ids = None
    # Use the helper's image output when available, else fetch each URL.
    if process_vision_info is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:
        image_data, _ = process_vision_info(messages)
    return llm, prompt, stop_token_ids, image_data, None
 # Maps each `--model-type` choice to the loader that builds its engine and
 # prompt. Every loader takes (question, image_urls) and returns a 5-tuple.
 model_example_map = {
    "phi3_v": load_phi3v,
    "internvl_chat": load_internvl,
    "qwen2_vl": load_qwen2_vl,
    "qwen_vl_chat": load_qwenvl_chat,
 }
 def run_generate(model, question: str, image_urls: List[str]):
    """Run a single ``LLM.generate`` request with all images and print it.

    ``model`` is a key of ``model_example_map``. The loader's prompt is sent
    together with the image data, and the text of the first completion of
    every returned output is printed.
    """
    loader = model_example_map[model]
    llm, prompt, stop_token_ids, image_data, _ = loader(question, image_urls)
    # Loaders that return no image data leave fetching the URLs to us.
    if image_data is None:
        image_data = [fetch_image(url) for url in image_urls]
    params = SamplingParams(temperature=0.0,
                            max_tokens=128,
                            stop_token_ids=stop_token_ids)
    request = {
        "prompt": prompt,
        "multi_modal_data": {
            "image": image_data
        },
    }
    for result in llm.generate(request, sampling_params=params):
        print(result.outputs[0].text)
 def run_chat(model: str, question: str, image_urls: List[str]):
    """Run a single ``LLM.chat`` request with the image URLs and print it.

    Only the engine, stop token ids and chat template of the loader are
    used; the conversation is built here from ``question`` and the URLs.
    """
    llm, _, stop_token_ids, _, chat_template = model_example_map[model](
        question, image_urls)
    params = SamplingParams(temperature=0.0,
                            max_tokens=128,
                            stop_token_ids=stop_token_ids)
    # A single user turn: the text part first, then one part per image URL.
    parts = [{"type": "text", "text": question}]
    parts.extend({
        "type": "image_url",
        "image_url": {
            "url": link
        },
    } for link in image_urls)
    conversation = [{"role": "user", "content": parts}]
    replies = llm.chat(
        conversation,
        sampling_params=params,
        chat_template=chat_template,
    )
    for reply in replies:
        print(reply.outputs[0].text)
 def main(args: Namespace):
    """Run the demo selected by ``args.method`` for ``args.model_type``.

    Raises ValueError when the method is neither "generate" nor "chat".
    """
    model = args.model_type
    method = args.method
    if method not in ("generate", "chat"):
        raise ValueError(f"Invalid method: {method}")
    runner = run_generate if method == "generate" else run_chat
    runner(model, QUESTION, IMAGE_URLS)
 # Command-line entry point: pick a model loader and the `vllm.LLM` method.
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models that support multi-image input')
    # Valid values are exactly the keys of `model_example_map`.
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="phi3_v",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    # "generate" sends a rendered prompt; "chat" sends a conversation.
    parser.add_argument("--method",
                        type=str,
                        default="generate",
                        choices=["generate", "chat"],
                        help="The method to run in `vllm.LLM`.")
    args = parser.parse_args()
    main(args)
--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
@@ -0,0 +1,33 @@
 import os
 from vllm import LLM, SamplingParams
 # Enable the torch profiler by setting VLLM_TORCH_PROFILER_DIR before the
 # engine is created; it can also be set on the command line.
 # NOTE(review): presumably this is the directory traces are written to.
 os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
 # Sample prompts.
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
 ]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # Create an LLM.
 llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
 # Profile only the generation call: start right before it, stop right after.
 llm.start_profile()
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
 llm.stop_profile()
 # Print the outputs.
 for output in outputs:
    prompt = output.prompt
    # Only the first completion of each request is shown.
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/openai_vision_api_client.py
+++ b/examples/openai_vision_api_client.py
@@ -27,9 +27,10 @@ client = OpenAI(
 models = client.models.list()
 model = models.data[0].id
 # Single-image input inference
 image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-# Use image url in the payload
+## Use image url in the payload
 chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role":
@@ -52,10 +53,10 @@ chat_completion_from_url = client.chat.completions.create(
 )
 result = chat_completion_from_url.choices[0].message.content
-print(f"Chat completion output:{result}")
+print("Chat completion output:", result)
-# Use base64 encoded image in the payload
+## Use base64 encoded image in the payload
 def encode_image_base64_from_url(image_url: str) -> str:
    """Encode an image retrieved from a remote url to base64 format."""
@@ -122,4 +123,4 @@ chat_completion_from_url = client.chat.completions.create(
 )
 result = chat_completion_from_url.choices[0].message.content
-print(f"Chat completion output:{result}")
+print("Chat completion output:", result)
--- a/examples/tool_chat_template_hermes.jinja
+++ b/examples/tool_chat_template_hermes.jinja
@@ -89,22 +89,23 @@
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" and message.tool_calls is defined %}
        {{- '<|im_start|>' + message.role }}
-            {%- for tool_call in message.tool_calls %}
+        {%- for tool_call in message.tool_calls %}
-                {{- '\n<tool_call>\n' }}
+            {{- '\n<tool_call>\n' }}
-                {%- if tool_call.function is defined %}
+            {%- if tool_call.function is defined %}
-                    {%- set tool_call = tool_call.function %}
+                {%- set tool_call = tool_call.function %}
-                {%- endif %}
+            {%- endif %}
-                {{- '{' }}
+            {{- '{' }}
-                {{- '"name": "' }}
+            {{- '"name": "' }}
-                {{- tool_call.name }}
+            {{- tool_call.name }}
-                {{- '"}' }}
+            {{- '"' }}
            {%- if tool_call.arguments is defined %}
                {{- ', ' }}
-                {%- if tool_call.arguments is defined %}
+                {{- '"arguments": ' }}
-                    {{- '"arguments": ' }}
+                {{- tool_call.arguments|tojson }}
-                    {{- tool_call.arguments|tojson }}
+            {%- endif %}
-                {%- endif %}
+            {{- '}' }}
-                {{- '\n</tool_call>' }}
+            {{- '\n</tool_call>' }}
-            {%- endfor %}
+        {%- endfor %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if loop.previtem and loop.previtem.role != "tool" %}
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,7 +76,7 @@ exclude = [
 [tool.codespell]
 ignore-words-list = "dout, te, indicies, subtile"
-skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
+skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
 [tool.isort]
 use_parentheses = true
--- a/requirements-adag.txt
+++ b/requirements-adag.txt
@@ -1,3 +0,0 @@
 # Dependencies for Ray accelerated DAG
 cupy-cuda12x
 ray >= 2.32
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -7,11 +7,11 @@ py-cpuinfo
 transformers >= 4.43.2  # Required for Chameleon and Llama 3.1 hotfix.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
-fastapi
+fastapi >= 0.114.1
 aiohttp
-openai >= 1.0 # Ensure modern openai package (ensure types module present)
+openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
 uvicorn[standard]
-pydantic >= 2.8  # Required for OpenAI server.
+pydantic >= 2.9  # Required for fastapi >= 0.113.0
 pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
@@ -25,5 +25,7 @@ pyzmq
 msgspec
 gguf == 0.9.1
 importlib_metadata
-mistral_common >= 1.3.4
+mistral_common >= 1.4.0
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 einops # Required for Qwen2-VL.
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,6 +1,3 @@
 # Needed for Ray accelerated DAG tests
 -r requirements-adag.txt
 # testing
 pytest
 tensorizer>=2.9.0
@@ -14,9 +11,10 @@ awscli
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio test
 opencv-python # required for video test
 peft
 requests
-ray
+ray[adag]>=2.35
 sentence-transformers # required for embedding
 soundfile # required for audio test
 compressed-tensors==0.4.0 # required for compressed-tensors
--- a/setup.py
+++ b/setup.py
@@ -170,14 +170,17 @@ class cmake_build_ext(build_ext):
        if is_sccache_available():
            cmake_args += [
                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
+                '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
            ]
        elif is_ccache_available():
            cmake_args += [
                '-DCMAKE_C_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
            ]
        # Pass the python executable to cmake so it can find an exact
@@ -502,6 +505,7 @@ setup(
    ext_modules=ext_modules,
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
        "video": ["opencv-python"],  # Required for video processing
        "audio": ["librosa", "soundfile"]  # Required for audio processing
    },
    cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -1,4 +1,3 @@
 import os
 import subprocess
 import sys
 import time
@@ -26,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
@pytest.fixture
-def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
+def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
               worker_use_ray: bool):
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    commands = [
@@ -37,25 +35,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
        str(tokenizer_pool_size)
    ]
    # Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
    # to prevent `--engine-use-ray` raises an exception due to it deprecation
    env_vars = os.environ.copy()
    env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
    if engine_use_ray:
        commands.append("--engine-use-ray")
    if worker_use_ray:
        commands.append("--worker-use-ray")
-    uvicorn_process = subprocess.Popen(commands, env=env_vars)
+    uvicorn_process = subprocess.Popen(commands)
    yield
    uvicorn_process.terminate()
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("worker_use_ray", [False, True])
-@pytest.mark.parametrize("engine_use_ray", [False, True])
+def test_api_server(api_server, tokenizer_pool_size: int,
-def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
+                    worker_use_ray: bool):
                    engine_use_ray: bool):
    """
    Run the API server and test it.
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -1,8 +1,10 @@
 import asyncio
 import os
 import uuid
 from asyncio import CancelledError
 from copy import copy
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional
 import pytest
 import pytest_asyncio
@@ -12,6 +14,7 @@ from vllm import SamplingParams
 from vllm.config import ParallelConfig
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
 from vllm.outputs import RequestOutput as RealRequestOutput
 from vllm.sampling_params import RequestOutputKind
 from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
@@ -72,14 +75,12 @@ class MockEngine:
 class MockAsyncLLMEngine(AsyncLLMEngine):
-
+    _engine_class = MockEngine
    def _init_engine(self, *args, **kwargs):
        return MockEngine()
@pytest.mark.asyncio
 async def test_new_requests_event():
-    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
+    engine = MockAsyncLLMEngine(worker_use_ray=False)
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0
@@ -112,16 +113,11 @@ async def test_new_requests_event():
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1
-    # Allow deprecated engine_use_ray to not raise exception
+    engine = MockAsyncLLMEngine(worker_use_ray=True)
    os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None
    os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
 def start_engine():
    wait_for_gpu_memory_to_clear(
@@ -130,8 +126,17 @@ def start_engine():
        timeout_s=60,
    )
    num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
    print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
    return AsyncLLMEngine.from_engine_args(
-        AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
+        AsyncEngineArgs(model="facebook/opt-125m",
                        enforce_eager=True,
                        num_scheduler_steps=num_scheduler_steps))
 def uid() -> str:
    """Return a newly generated random UUID (version 4) as a string."""
    return str(uuid.uuid4())
@pytest_asyncio.fixture(scope="module")
@@ -156,57 +161,177 @@ def should_do_global_cleanup_after_test(request) -> bool:
@pytest.mark.asyncio(scope="module")
 async def test_asyncio_run(async_engine):
    scheduler_config = await async_engine.get_scheduler_config()
    num_scheduler_steps = scheduler_config.num_scheduler_steps
    async def run(prompt: str):
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=32,
            min_tokens=32,
        )
        output_count = 0
        final_output = None
        async for output in async_engine.generate(prompt,
                                                  sampling_params,
-                                                  request_id=prompt):
+                                                  request_id=uid()):
            output_count += 1
            final_output = output
-        return final_output
+        return final_output, output_count
    results = await asyncio.gather(
        run("test0"),
-        run("test1"),
+        run("test0"),
    )
    assert len(results) == 2
    first, second = results
    # remove nondeterministic fields for comparison
    first[0].metrics = None
    second[0].metrics = None
    first[0].request_id = None
    second[0].request_id = None
    assert str(first) == str(second)
    output_count = results[0][1]
    if num_scheduler_steps == 1:
        assert output_count == 32
    else:
        assert 1 < output_count < 32
@pytest.mark.asyncio(scope="module")
 async def test_output_kinds(async_engine):
    """Test that output_kind works as expected and that
    results are equivalent across different kinds."""
    scheduler_config = await async_engine.get_scheduler_config()
    num_scheduler_steps = scheduler_config.num_scheduler_steps
    # Greedy sampling with exactly 32 generated tokens (min == max).
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=32,
        min_tokens=32,
    )
    # Runs one request with the given output kind and returns the last
    # output's prompt ids, token ids and text, plus how many outputs arrived.
    async def run(prompt: str, kind: RequestOutputKind):
        # Copy so that concurrent runs do not share the mutated params.
        params = copy(sampling_params)
        params.output_kind = kind
        output_count = 0
        final_output = None
        async for output in async_engine.generate(prompt,
                                                  params,
                                                  request_id=uid()):
            output_count += 1
            final_output = output
        assert final_output is not None
        return (final_output.prompt_token_ids,
                final_output.outputs[0].token_ids,
                final_output.outputs[0].text, output_count)
    # Runs one DELTA request and concatenates the streamed pieces so the
    # result has the same tuple layout as `run`.
    async def run_deltas(prompt: str):
        params = copy(sampling_params)
        params.output_kind = RequestOutputKind.DELTA
        prompt_tokens = None
        output_tokens: List[int] = []
        output_text = ""
        output_count = 0
        async for output in async_engine.generate(prompt,
                                                  params,
                                                  request_id=uid()):
            token_ids = output.outputs[0].token_ids
            text = output.outputs[0].text
            # Ensure we get prompt ids iff we haven't yet received output tokens
            if output_tokens:
                # Each later delta carries between 1 and num_scheduler_steps
                # new tokens.
                assert 1 <= len(token_ids) <= num_scheduler_steps
                assert text
                assert not output.prompt_token_ids
            else:
                assert output.prompt_token_ids
                prompt_tokens = output.prompt_token_ids
            output_tokens.extend(token_ids)
            output_text += text
            output_count += 1
        return prompt_tokens, output_tokens, output_text, output_count
    # Issue the same prompt concurrently with all three output kinds.
    results = await asyncio.gather(
        run("common input prompt", RequestOutputKind.CUMULATIVE),
        run("common input prompt", RequestOutputKind.FINAL_ONLY),
        run_deltas("common input prompt"))
    # Make sure outputs are the same
    prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
    assert len(prompt_set) == 1
    text_set = set(text for _, _, text, _ in results)
    assert len(text_set) == 1
    tokens_set = set(tuple(ids) for _, ids, _, _ in results)
    assert len(tokens_set) == 1
    cumulative, final, deltas = results
    # output message counts
    assert cumulative[3] == deltas[3]
    # With one scheduler step there is one output per token; with more than
    # one step the test expects fewer than 32 outputs but more than one.
    if num_scheduler_steps == 1:
        assert cumulative[3] == 32
    else:
        assert 1 < cumulative[3] < 32
    # FINAL_ONLY yields exactly one output.
    assert final[3] == 1
@pytest.mark.asyncio(scope="module")
 async def test_cancellation(async_engine):
    scheduler_config = await async_engine.get_scheduler_config()
    num_scheduler_steps = scheduler_config.num_scheduler_steps
    sampling_params = SamplingParams(
        temperature=0,
-        min_tokens=10,
+        min_tokens=13,
-        max_tokens=10,
+        max_tokens=13,
    )
    stop_at = 5 if num_scheduler_steps == 1 else 1
    request_id = uid()
    i = 0
    with pytest.raises(CancelledError):
        async for output in async_engine.generate("test2",
                                                  sampling_params,
-                                                  request_id="test2"):
+                                                  request_id=request_id):
            assert not output.finished
            i += 1
-            if i == 5:
+            if i == stop_at:
-                await async_engine.abort("test2")
+                await async_engine.abort(request_id)
-    assert i == 5
+    assert i == stop_at
@pytest.mark.asyncio(scope="module")
 async def test_delayed_generator(async_engine):
    scheduler_config = await async_engine.get_scheduler_config()
    if scheduler_config.num_scheduler_steps != 1:
        pytest.skip("no need to test this one with multistep")
    sampling_params = SamplingParams(
        temperature=0,
        min_tokens=10,
        max_tokens=10,
    )
-    stream = async_engine.generate("test3",
+    stream = async_engine.generate("test3", sampling_params, request_id=uid())
                                   sampling_params,
                                   request_id="test3")
    i = 0
    final_output: Optional[RealRequestOutput] = None
    async for output in stream:
--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
@@ -1,6 +1,7 @@
 import pytest
-from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template
+from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                         load_chat_template)
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -87,7 +88,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
        add_generation_prompt=add_generation_prompt)
    # Call the function and get the result
-    result = apply_chat_template(
+    result = apply_hf_chat_template(
        tokenizer,
        conversation=mock_request.messages,
        chat_template=mock_request.chat_template or template_content,
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -19,16 +19,11 @@ def server():
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--engine-use-ray",
        "--chat-template",
        str(chatml_jinja_path),
    ]
-    # Allow `--engine-use-ray`, otherwise the launch of the server throw
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
    # an error due to try to use a deprecated feature
    env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
    with RemoteOpenAIServer(MODEL_NAME, args,
                            env_dict=env_dict) as remote_server:
        yield remote_server
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -3,12 +3,16 @@
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
 import os
 import pickle
 import re
 import weakref
 from unittest.mock import patch
 import pytest
 from vllm import LLM
 from vllm.utils import is_hip
 from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
 from ..models.utils import check_outputs_equal
@@ -64,3 +68,29 @@ def test_models(
        name_0="hf",
        name_1="vllm",
    )
 def test_model_with_failure(vllm_runner) -> None:
    """Check that a failing forward pass dumps its inputs to a pickle file.

    ``OPTForCausalLM.forward`` is patched to raise ``ValueError``. The test
    expects the propagated error message to name a ``.pkl`` file, loads it,
    and checks that it holds ``arg_1``/``arg_2``/``arg_3`` with ``arg_1``
    being a ``ModelInputForGPUWithSamplingMetadata``.
    """
    # Stays None until the dump path has been parsed from the error message,
    # so the cleanup below never trips over an unbound name and hides the
    # real failure.
    filename = None
    try:
        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
                   side_effect=ValueError()):
            with pytest.raises(ValueError) as exc_info:
                vllm_runner("facebook/opt-125m",
                            dtype="half",
                            enforce_eager=False,
                            gpu_memory_utilization=0.7)
            # The dot is escaped so only a literal ".pkl" suffix matches.
            matches = re.search(r"input dumped to (.+)\.pkl",
                                str(exc_info.value))
            assert matches is not None
            filename = f"{matches.group(1)}.pkl"
        with open(filename, "rb") as filep:
            inputs = pickle.load(filep)
        if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
            raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
                                 f"{list(inputs.keys())}")
        assert isinstance(inputs["arg_1"],
                          ModelInputForGPUWithSamplingMetadata)
    finally:
        # Only remove the dump when its path is known and the file exists.
        if filename is not None and os.path.exists(filename):
            os.remove(filename)
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -64,6 +64,7 @@ def test_chunked_prefill_recompute(
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_seqs=max_num_seqs,
            worker_use_ray=worker_use_ray,
            disable_log_stats=False,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -16,5 +16,7 @@ def test_full_graph(model):
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B")
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B",
              enforce_eager=True,
              load_format="dummy")
    llm.generate(prompts, sampling_params)
--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -2,7 +2,7 @@ from typing import Optional
 import torch
-from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispacther
+from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
 class MyMod(torch.nn.Module):
@@ -13,7 +13,7 @@ class MyMod(torch.nn.Module):
        return x * 2
-class MyWrapper(TorchCompileWrapperWithCustomDispacther):
+class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
    def __init__(self, model):
        self.model = model
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -21,6 +21,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (destroy_distributed_environment,
@@ -44,6 +45,7 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
 PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
                         List[List[Tuple[np.ndarray, int]]]]
 PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
 def _read_prompts(filename: str) -> List[str]:
@@ -85,8 +87,35 @@ class _ImageAssets(_ImageAssetsBase):
        return [prompts["stop_sign"], prompts["cherry_blossom"]]
 class _VideoAssetPrompts(TypedDict):
    """Prompt strings for the video assets, one key per asset."""
    sample_demo_1: str
 if sys.version_info >= (3, 9):
    # From 3.9 on the base can be parametrized with the element type.
    class _VideoAssetsBase(UserList[VideoAsset]):
        pass
 else:
    # UserList cannot be subscripted
    class _VideoAssetsBase(UserList):
        pass
 class _VideoAssets(_VideoAssetsBase):
    """List-like collection of the video assets used by the tests."""

    def __init__(self) -> None:
        # The collection currently holds a single sample clip.
        bundled_videos = [VideoAsset("sample_demo_1.mp4")]
        super().__init__(bundled_videos)

    def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
        """Return a list with one prompt per asset in this collection."""
        sample_prompt = prompts["sample_demo_1"]
        return [sample_prompt]
 # Module-level asset collections, built once at import time.
 IMAGE_ASSETS = _ImageAssets()
 """Singleton instance of :class:`_ImageAssets`."""
 VIDEO_ASSETS = _VideoAssets()
 """Singleton instance of :class:`_VideoAssets`."""
@pytest.fixture(autouse=True)
@@ -202,6 +231,11 @@ def image_assets() -> _ImageAssets:
    return IMAGE_ASSETS
@pytest.fixture(scope="session")
 def video_assets() -> _VideoAssets:
    """Session-scoped fixture returning the shared ``VIDEO_ASSETS`` object."""
    return VIDEO_ASSETS
 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
@@ -278,7 +312,8 @@ class HfRunner:
    def generate(
        self,
        prompts: List[str],
-        images: Optional[List[Image.Image]] = None,
+        images: Optional[PromptImageInput] = None,
        videos: Optional[List[np.ndarray]] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        if images:
@@ -292,6 +327,8 @@ class HfRunner:
            }
            if images is not None and images[i] is not None:
                processor_kwargs["images"] = images[i]
            if videos is not None and videos[i] is not None:
                processor_kwargs["videos"] = videos[i]
            inputs = self.processor(**processor_kwargs)
            inputs = self.postprocess_inputs(inputs)
@@ -314,7 +351,7 @@ class HfRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[List[Image.Image]] = None,
+        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
@@ -351,7 +388,8 @@ class HfRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[List[Image.Image]] = None,
+        images: Optional[PromptImageInput] = None,
        videos: Optional[List[np.ndarray]] = None,
        **kwargs: Any,
    ) -> List[List[torch.Tensor]]:
        all_logprobs: List[List[torch.Tensor]] = []
@@ -362,6 +400,8 @@ class HfRunner:
            }
            if images is not None and images[i] is not None:
                processor_kwargs["images"] = images[i]
            if videos is not None and videos[i] is not None:
                processor_kwargs["videos"] = videos[i]
            inputs = self.processor(**processor_kwargs)
            inputs = self.postprocess_inputs(inputs)
@@ -433,8 +473,9 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
-        images: Optional[List[Image.Image]] = None,
+        images: Optional[PromptImageInput] = None,
-        audios: Optional[List[Tuple[np.ndarray, int]]] = None,
+        audios: Optional[PromptAudioInput] = None,
        videos: Optional[List[np.ndarray]] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
        all_logprobs: List[List[Dict[int, float]]] = []
@@ -454,6 +495,8 @@ class HfRunner:
                processor_kwargs["audio"] = audio
                processor_kwargs["sampling_rate"] = sr
            if videos is not None:
                processor_kwargs["videos"] = videos[i]
            inputs = self.processor(**processor_kwargs)
            inputs = self.postprocess_inputs(inputs)
@@ -615,8 +658,8 @@ class VllmRunner:
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs
    @staticmethod
    def _final_steps_generate_w_logprobs(
        self,
        req_outputs: List[RequestOutput],
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
@@ -634,12 +677,16 @@ class VllmRunner:
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        assert sampling_params.logprobs is not None
        if images is not None:
            assert len(prompts) == len(images)
        if videos is not None:
            assert len(prompts) == len(videos)
        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        if images is not None:
            for i, image in enumerate(images):
@@ -649,6 +696,11 @@ class VllmRunner:
            for i, audio in enumerate(audios):
                inputs[i]["multi_modal_data"] = {"audio": audio}
        if videos is not None:
            for i, video in enumerate(videos):
                inputs[i]["multi_modal_data"] = {"video": video}
        print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)
        return self._final_steps_generate_w_logprobs(req_outputs)
@@ -671,7 +723,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[List[Image.Image]] = None,
+        images: Optional[PromptImageInput] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params, images=images)
@@ -685,6 +737,7 @@ class VllmRunner:
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[List[int]] = None,
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        greedy_logprobs_params = SamplingParams(temperature=0.0,
@@ -694,7 +747,8 @@ class VllmRunner:
        outputs = self.generate_w_logprobs(prompts,
                                           greedy_logprobs_params,
                                           images=images,
-                                           audios=audios)
+                                           audios=audios,
                                           videos=videos)
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]
--- a/tests/distributed/test_multimodal_broadcast.py
+++ b/tests/distributed/test_multimodal_broadcast.py
@@ -35,9 +35,11 @@ def test_models(hf_runner, vllm_runner, image_assets, model: str,
    if model.startswith("llava-hf/llava-1.5"):
        from ..models.test_llava import models, run_test
    elif model.startswith("llava-hf/llava-v1.6"):
-        from ..models.test_llava_next import models, run_test
+        from ..models.test_llava_next import run_test  # type: ignore[no-redef]
        from ..models.test_llava_next import models
    elif model.startswith("facebook/chameleon"):
-        from ..models.test_chameleon import models, run_test
+        from ..models.test_chameleon import run_test  # type: ignore[no-redef]
        from ..models.test_chameleon import models
    else:
        raise NotImplementedError(f"Unsupported model: {model}")
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -18,23 +18,29 @@ logger = init_logger("test_pipeline_parallel")
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
-@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
+@pytest.mark.parametrize(
-                          "MODEL_NAME, DIST_BACKEND"),
+    ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
-                         [
+     "MODEL_NAME, DIST_BACKEND"),
-                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
+    [
-                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-                         ])
+        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        # TODO: Enable internVL2 in a separate test if needed
        # (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"),
        # (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"),
        # (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"),
    ],
 )
@fork_new_process_for_each_test
-def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
+def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
-                    DIST_BACKEND):
+                    TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
@@ -43,6 +49,8 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "8192",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
@@ -59,7 +67,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
    tp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
-        "bfloat16",
+        "float16",
        "--max-model-len",
        "8192",
        "--tensor-parallel-size",
        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
        "--distributed-executor-backend",
@@ -71,6 +81,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
    if EAGER_MODE:
        pp_args.append("--enforce-eager")
        tp_args.append("--enforce-eager")
    if TRUST_REMOTE_CODE:
        pp_args.append("--trust-remote-code")
        tp_args.append("--trust-remote-code")
    pp_env = None
    if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
            and CHUNKED_PREFILL):
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -83,7 +83,7 @@ def test_local_workers() -> None:
    workers[3].process.kill()
    # Other workers should get shut down here
-    worker_monitor.join(2)
+    worker_monitor.join(20)
    # Ensure everything is stopped
    assert not worker_monitor.is_alive()
@@ -108,7 +108,7 @@ def test_local_workers_clean_shutdown() -> None:
    # Clean shutdown
    worker_monitor.close()
-    worker_monitor.join(5)
+    worker_monitor.join(20)
    # Ensure everything is stopped
    assert not worker_monitor.is_alive()
@@ -161,7 +161,7 @@ async def test_local_workers_async() -> None:
    workers[3].process.kill()
    # Other workers should get shut down here
-    worker_monitor.join(2)
+    worker_monitor.join(20)
    # Ensure everything is stopped
    assert not worker_monitor.is_alive()
--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
    # token ids.
    llm = LLM(model=model, skip_tokenizer_init=True)
    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
-    with pytest.raises(ValueError) as err:
+
    with pytest.raises(ValueError, match="cannot pass text prompts when"):
        llm.generate("abc", sampling_params)
-    assert "prompts must be None if" in str(err.value)
+
    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
                           sampling_params=sampling_params)
    assert len(outputs) > 0
--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -50,7 +50,7 @@ def zephyr_lora_files():
@pytest.mark.skip_global_cleanup
 def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
    lora_request = [
-        LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
+        LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
        for idx in range(len(PROMPTS))
    ]
    # Multiple SamplingParams should be matched with each prompt
--- a/tests/entrypoints/offline_mode/init.py
+++ b/tests/entrypoints/offline_mode/init.py
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -0,0 +1,77 @@
 """Tests for HF_HUB_OFFLINE mode"""
 import importlib
 import sys
 import weakref
 import pytest
 from vllm import LLM
 from ...conftest import cleanup
 # Model identifier passed to every ``LLM(...)`` constructed in this module.
 MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
 def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.10,
              enforce_eager=True)
    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)
        del llm
    cleanup()
@pytest.mark.skip_global_cleanup
 def test_offline_mode(llm: LLM, monkeypatch):
    # we use the llm fixture to ensure the model files are in-cache
    del llm
    # Set HF to offline mode and ensure we can still construct an LLM
    try:
        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
        # Need to re-import huggingface_hub and friends to setup offline mode
        _re_import_modules()
        # Cached model files should be used in offline mode
        LLM(model=MODEL_NAME,
            max_num_batched_tokens=4096,
            tensor_parallel_size=1,
            gpu_memory_utilization=0.10,
            enforce_eager=True)
    finally:
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode
        monkeypatch.delenv("HF_HUB_OFFLINE")
        _re_import_modules()
        pass
 def _re_import_modules():
    hf_hub_module_names = [
        k for k in sys.modules if k.startswith("huggingface_hub")
    ]
    transformers_module_names = [
        k for k in sys.modules if k.startswith("transformers")
        and not k.startswith("transformers_modules")
    ]
    reload_exception = None
    for module_name in hf_hub_module_names + transformers_module_names:
        try:
            importlib.reload(sys.modules[module_name])
        except Exception as e:
            reload_exception = e
            # Try to continue clean up so that other tests are less likely to
            # be affected
    # Error this test if reloading a module failed
    if reload_exception is not None:
        raise reload_exception
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -8,7 +8,9 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
 INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
 INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -0,0 +1,107 @@
 from http import HTTPStatus
 from unittest.mock import MagicMock
 import pytest
 from vllm.config import ModelConfig
 from vllm.engine.protocol import AsyncEngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                              LoadLoraAdapterRequest,
                                              UnloadLoraAdapterRequest)
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 # Model name handed to OpenAIServing as the only served model name.
 MODEL_NAME = "meta-llama/Llama-2-7b"
 # Templates the tests compare load/unload responses against; each is
 # formatted with ``lora_name``.
 LORA_LOADING_SUCCESS_MESSAGE = (
    "Success: LoRA adapter '{lora_name}' added successfully.")
 LORA_UNLOADING_SUCCESS_MESSAGE = (
    "Success: LoRA adapter '{lora_name}' removed successfully.")
 async def _async_serving_engine_init():
    """Build an ``OpenAIServing`` whose engine client and config are mocks."""
    engine_client = MagicMock(spec=AsyncEngineClient)
    model_config = MagicMock(spec=ModelConfig)
    # Give max_model_len a concrete value to avoid a missing attribute
    model_config.max_model_len = 2048
    return OpenAIServing(engine_client,
                         model_config,
                         served_model_names=[MODEL_NAME],
                         lora_modules=None,
                         prompt_adapters=None,
                         request_logger=None)
@pytest.mark.asyncio
 async def test_load_lora_adapter_success():
    serving_engine = await _async_serving_engine_init()
    request = LoadLoraAdapterRequest(lora_name="adapter",
                                     lora_path="/path/to/adapter2")
    response = await serving_engine.load_lora_adapter(request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
    assert len(serving_engine.lora_requests) == 1
    assert serving_engine.lora_requests[0].lora_name == "adapter"
@pytest.mark.asyncio
 async def test_load_lora_adapter_missing_fields():
    serving_engine = await _async_serving_engine_init()
    request = LoadLoraAdapterRequest(lora_name="", lora_path="")
    response = await serving_engine.load_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
    assert response.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
 async def test_load_lora_adapter_duplicate():
    serving_engine = await _async_serving_engine_init()
    request = LoadLoraAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
    response = await serving_engine.load_lora_adapter(request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
        lora_name='adapter1')
    assert len(serving_engine.lora_requests) == 1
    request = LoadLoraAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
    response = await serving_engine.load_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
    assert response.code == HTTPStatus.BAD_REQUEST
    assert len(serving_engine.lora_requests) == 1
@pytest.mark.asyncio
 async def test_unload_lora_adapter_success():
    serving_engine = await _async_serving_engine_init()
    request = LoadLoraAdapterRequest(lora_name="adapter1",
                                     lora_path="/path/to/adapter1")
    response = await serving_engine.load_lora_adapter(request)
    assert len(serving_engine.lora_requests) == 1
    request = UnloadLoraAdapterRequest(lora_name="adapter1")
    response = await serving_engine.unload_lora_adapter(request)
    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
        lora_name='adapter1')
    assert len(serving_engine.lora_requests) == 0
@pytest.mark.asyncio
 async def test_unload_lora_adapter_missing_fields():
    serving_engine = await _async_serving_engine_init()
    request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
    response = await serving_engine.unload_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
    assert response.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
 async def test_unload_lora_adapter_not_found():
    serving_engine = await _async_serving_engine_init()
    request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
    response = await serving_engine.unload_lora_adapter(request)
    assert isinstance(response, ErrorResponse)
    assert response.type == "InvalidUserInput"
    assert response.code == HTTPStatus.BAD_REQUEST
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -3,8 +3,10 @@ from typing import Type
 import pytest
 import torch
 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
-                                                   NewGELU, SiluAndMul)
+                                                   NewGELU, QuickGELU,
                                                   SiluAndMul)
 from .allclose_default import get_default_atol, get_default_rtol
@@ -39,18 +41,28 @@ def test_act_and_mul(
    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
    if activation == "silu":
        layer = SiluAndMul()
        fn = torch.ops._C.silu_and_mul
    elif activation == "gelu":
        layer = GeluAndMul(approximate="none")
        fn = torch.ops._C.gelu_and_mul
    elif activation == "gelu_tanh":
        layer = GeluAndMul(approximate="tanh")
        fn = torch.ops._C.gelu_tanh_and_mul
    out = layer(x)
    ref_out = layer.forward_native(x)
    # The SiLU and GELU implementations are equivalent to the native PyTorch
    # implementations, so we can do exact comparison.
    torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
    d = x.shape[-1] // 2
    output_shape = (x.shape[:-1] + (d, ))
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    opcheck(fn, (out, x))
-@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
+
@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
                                        (NewGELU, torch.ops._C.gelu_new),
                                        (QuickGELU, torch.ops._C.gelu_quick)])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@@ -70,10 +82,14 @@ def test_activation(
        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, d, dtype=dtype)
-    layer = activation()
+    layer = activation[0]()
    fn = activation[1]
    out = layer(x)
    ref_out = layer.forward_native(x)
    torch.testing.assert_close(out,
                               ref_out,
                               atol=get_default_atol(out),
                               rtol=get_default_rtol(out))
    out = torch.empty_like(x)
    opcheck(fn, (out, x))
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -6,6 +6,7 @@ import torch
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.utils import get_max_shared_memory_bytes, is_hip
@@ -198,6 +199,13 @@ def test_paged_attention(
            k_scale,
            v_scale,
        )
        opcheck(torch.ops._C.paged_attention_v1,
                (output, query, key_cache, value_cache, num_kv_heads, scale,
                 block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
                 kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
                cond=(head_size == HEAD_SIZES[0]))
    elif version == "v2":
        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
        assert PARTITION_SIZE % block_size == 0
@@ -230,6 +238,14 @@ def test_paged_attention(
            k_scale,
            v_scale,
        )
        opcheck(torch.ops._C.paged_attention_v2,
                (output, exp_sums, max_logits, tmp_output, query, key_cache,
                 value_cache, num_kv_heads, scale, block_tables, seq_lens,
                 block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
                 k_scale, v_scale, 0, 0, 0, 64, 0),
                cond=(head_size == HEAD_SIZES[0]))
    else:
        raise AssertionError(f"Unknown version: {version}")
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -4,6 +4,7 @@ from typing import List, Tuple
 import pytest
 import torch
 from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from vllm import _custom_ops as ops
 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
@@ -87,6 +88,11 @@ def test_copy_blocks(
    block_mapping_tensor = torch.tensor(block_mapping,
                                        dtype=torch.int64,
                                        device=device).view(-1, 2)
    opcheck(torch.ops._C_cache_ops.copy_blocks,
            (key_caches, value_caches, block_mapping_tensor),
            test_utils=DEFAULT_OPCHECK_TEST_UTILS,
            cond=(head_size == HEAD_SIZES[0]))
    ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
    # Run the reference implementation.
@@ -162,6 +168,10 @@ def test_reshape_and_cache(
    k_scale = v_scale = 1.0
    # Call the reshape_and_cache kernel.
    opcheck(torch.ops._C_cache_ops.reshape_and_cache,
            (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
             k_scale, v_scale),
            cond=(head_size == HEAD_SIZES[0]))
    ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
                          kv_cache_dtype, k_scale, v_scale)
@@ -269,6 +279,10 @@ def test_reshape_and_cache_flash(
    k_scale = v_scale = 1.0
    # Call the reshape_and_cache_flash kernel.
    opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
            (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
             k_scale, v_scale),
            cond=(head_size == HEAD_SIZES[0]))
    ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
                                slot_mapping, kv_cache_dtype, k_scale, v_scale)
@@ -366,6 +380,14 @@ def test_swap_blocks(
    src_value_caches_clone = src_value_caches[0].clone()
    # Call the swap_blocks kernel.
    do_opcheck = (head_size == HEAD_SIZES[0])
    opcheck(torch.ops._C_cache_ops.swap_blocks,
            (src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
            cond=do_opcheck)
    opcheck(torch.ops._C_cache_ops.swap_blocks,
            (src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
            cond=do_opcheck)
    ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
                    block_mapping_tensor)
    ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -7,6 +7,7 @@ from typing import Optional, Type
 import pytest
 import torch
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
@@ -108,6 +109,9 @@ def cutlass_int8_gemm_helper(m: int,
    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
    opcheck(torch.ops._C.cutlass_scaled_mm,
            (out, a, b, scale_a, scale_b, bias))
@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
@@ -341,6 +345,15 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
    torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
    torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
    if azp_per_token:
        opcheck(torch.ops._C.cutlass_scaled_mm_azp,
                (out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32,
                 func_bias))
    else:
        opcheck(torch.ops._C.cutlass_scaled_mm_azp,
                (out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None,
                 func_bias))
 # Test working with a subset of A and B
 def test_cutlass_subset():
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -445,7 +445,8 @@ def test_flashinfer_decode_with_paged_fp8_kv(
                          head_size,
                          block_size,
                          "NONE",
-                          data_type=dtype)
+                          data_type=dtype,
                          q_data_type=dtype)
    output = wrapper.forward(query,
                             kv_cache_fp8,
                             logits_soft_cap=soft_cap,
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -2,6 +2,7 @@ import pytest
 import torch
 from tests.kernels.quant_utils import ref_dynamic_per_token_quant
 from tests.kernels.utils import opcheck
 from vllm._custom_ops import scaled_int8_quant
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -12,6 +13,16 @@ SEEDS = [0]
 SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
 def opcheck_int8_quant(output, input, scale=None):
    """Run ``opcheck`` on the int8 quantization op matching the arguments.

    With a ``scale`` the static op is checked. Without one, an uninitialized
    float32 scale buffer holding one entry per row of ``input`` is allocated
    and the dynamic op is checked instead.
    """
    if scale is None:
        # Rows are everything except the last dimension of the input.
        row_count = input.numel() // input.shape[-1]
        scale_buffer = torch.empty((row_count, 1),
                                   device=input.device,
                                   dtype=torch.float32)
        opcheck(torch.ops._C.dynamic_scaled_int8_quant,
                (output, input, scale_buffer))
        return
    opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale))
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@@ -34,6 +45,8 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
        ops_out, ref_out, atol=1,
        rtol=0.0)  # big atol to account for rounding errors
    opcheck_int8_quant(ops_out, x)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@@ -58,3 +71,5 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
    torch.testing.assert_close(
        out1, out2, atol=1,
        rtol=0.0)  # big atol to account for rounding errors
    opcheck_int8_quant(out2, x, scale)
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -1,6 +1,7 @@
 import pytest
 import torch
 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.layernorm import RMSNorm
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -52,3 +53,10 @@ def test_rms_norm(
        torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
    else:
        torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
    if residual is not None:
        opcheck(torch.ops._C.fused_add_rms_norm,
                (x, residual, layer.weight.data, layer.variance_epsilon))
    else:
        opcheck(torch.ops._C.rms_norm,
                (out, x, layer.weight.data, layer.variance_epsilon))
--- a/tests/kernels/test_machete_gemm.py
+++ b/tests/kernels/test_machete_gemm.py
@@ -9,6 +9,7 @@ from typing import Optional, Tuple
 import pytest
 import torch
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    pack_rows, quantize_weights)
@@ -76,6 +77,8 @@ def machete_quantize_and_pack(w: torch.Tensor,
    w_q = w_q.t().contiguous().t()  # convert to col major
    w_q_machete = ops.machete_prepack_B(w_q, wtype)
    opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype))
    return w_ref, w_q_machete, w_s, w_zp
@@ -146,6 +149,10 @@ def test_machete_all_schedules(shape, atype: torch.dtype,
            schedule=schedule,
        )
        opcheck(torch.ops._C.machete_gemm,
                (a, w_q_machete, wtype, w_s, maybe_convert_zeropoints(
                    w_zp, w_s), group_size, None, None, None, schedule))
        # Relax atol as our reduction dim becomes larger (more rounding error)
        # Relax atol when we have zeropoints since the way machete applies
        #  zeropoints (after scales) causes noise around 0
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -5,6 +5,7 @@ Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
 import pytest
 import torch
 from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
@@ -73,12 +74,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
                            act_order, mnk_factors):
    m_factor, n_factor, k_factor = mnk_factors
    size_m = m_factor
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor
    print(f"MNK = {size_m} {size_n} {size_k}")
    # Filter act_order
    if act_order:
        if group_size == -1:
@@ -112,6 +110,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
    marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
                                  weight_perm)
    opcheck(torch.ops._C.gptq_marlin_repack,
            (q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits))
    # Run Marlin repack GPU kernel
    marlin_q_w_2 = ops.gptq_marlin_repack(
        q_w_gptq,
@@ -137,12 +138,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
                           mnk_factors):
    m_factor, n_factor, k_factor = mnk_factors
    size_m = m_factor
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor
    print(f"MNK = {size_m} {size_n} {size_k}")
    # Normalize group_size
    if group_size == -1:
        group_size = size_k
@@ -165,6 +163,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
    marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
                                  weight_perm)
    opcheck(torch.ops._C.awq_marlin_repack,
            (q_w_awq, size_k, size_n, quant_type.size_bits))
    # Run Marlin repack GPU kernel
    marlin_q_w_2 = ops.awq_marlin_repack(
        q_w_awq,
@@ -204,9 +205,6 @@ def test_gptq_marlin_gemm(
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor
    print(f"MNK = {size_m} {size_n} {size_k}")
    print(f"groupsize = {group_size}")
    if act_order:
        if group_size == -1:
            return
@@ -224,6 +222,13 @@ def test_gptq_marlin_gemm(
    workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                                GPTQ_MARLIN_MAX_PARALLEL)
    opcheck(
        torch.ops._C.gptq_marlin_gemm,
        (a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices,
         workspace.scratch, quant_type, a_input.shape[0], b_weight.shape[1],
         a_input.shape[1], is_k_full, False, use_fp32_reduce),
        test_utils=DEFAULT_OPCHECK_TEST_UTILS)
    output = ops.gptq_marlin_gemm(
        a_input,
        marlin_q_w,
@@ -245,7 +250,6 @@ def test_gptq_marlin_gemm(
    torch.cuda.synchronize()
    max_diff = compute_max_diff(output, output_ref)
    print("max_diff = {}".format(max_diff))
    assert max_diff < 0.04
@@ -265,9 +269,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor
    print(f"MNK = {size_m} {size_n} {size_k}")
    print(f"groupsize = {group_size}")
    a_input = rand_data((size_m, size_k))
    b_weight = rand_data((size_k, size_n))
@@ -279,6 +280,12 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
    output_ref = torch.matmul(a_input, w_24_ref)
    opcheck(torch.ops._C.gptq_marlin_24_gemm,
            (a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s,
             workspace_24.scratch, quant_type, a_input.shape[0],
             b_weight.shape[1], a_input.shape[1]),
            test_utils=DEFAULT_OPCHECK_TEST_UTILS)
    output = ops.gptq_marlin_24_gemm(
        a_input,
        marlin_24_q_w_comp,
@@ -294,7 +301,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
    torch.cuda.synchronize()
    max_diff = compute_max_diff(output, output_ref)
    print("max_diff = {}".format(max_diff))
    assert max_diff < 0.04
@@ -321,9 +327,6 @@ def test_fp8_marlin_gemm(
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor
    print(f"MNK = {size_m} {size_n} {size_k}")
    print(f"groupsize = {group_size}")
    a_input = rand_data((size_m, size_k), dtype=dtype)
    b_weight = rand_data((size_k, size_n), dtype=dtype)
@@ -353,6 +356,10 @@ def test_fp8_marlin_gemm(
    workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                                GPTQ_MARLIN_MAX_PARALLEL)
    opcheck(torch.ops._C.fp8_marlin_gemm,
            (a_input, marlin_qweight, marlin_scales, workspace.scratch,
             num_bits, a_input.shape[0], b_weight.shape[1], a_input.shape[1]))
    output = ops.fp8_marlin_gemm(
        a=a_input,
        b_q_weight=marlin_qweight,
@@ -368,7 +375,6 @@ def test_fp8_marlin_gemm(
    torch.cuda.synchronize()
    max_diff = compute_max_diff(output, output_ref)
    print("max_diff = {}".format(max_diff))
    assert max_diff < 0.04
@@ -396,9 +402,6 @@ def test_awq_marlin_gemm(
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor
    print(f"MNK = {size_m} {size_n} {size_k}")
    print(f"groupsize = {group_size}")
    a_input = rand_data((size_m, size_k))
    b_weight = rand_data((size_k, size_n))
@@ -434,7 +437,6 @@ def test_awq_marlin_gemm(
    torch.cuda.synchronize()
    max_diff = compute_max_diff(output, output_ref)
    print("max_diff = {}".format(max_diff))
    assert max_diff < 0.04
@@ -460,9 +462,6 @@ def test_marlin_qqq_gemm(
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor
    print(f"MNK = {size_m} {size_n} {size_k}")
    print(f"groupsize = {group_size}")
    a_input = rand_data((size_m, size_k))
    b_weight = rand_data((size_k, size_n))
@@ -479,6 +478,11 @@ def test_marlin_qqq_gemm(
    workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
                                MARLIN_QQQ_MAX_PARALLEL)
    opcheck(torch.ops._C.marlin_qqq_gemm,
            (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
             marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
             b_weight.shape[1], a_input.shape[1]))
    output = ops.marlin_qqq_gemm(
        q_a,
        marlin_qqq_q_w,
@@ -495,6 +499,5 @@ def test_marlin_qqq_gemm(
    torch.cuda.synchronize()
    max_diff = compute_max_diff(output, output_ref)
    print("max_diff = {}".format(max_diff))
    assert max_diff < 0.04
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -2,6 +2,8 @@
 Run `pytest tests/kernels/test_moe.py`.
 """
 from typing import List
 import pytest
 import torch
 from transformers import MixtralConfig
@@ -9,7 +11,13 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
    fused_marlin_moe, single_marlin_moe)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.scalar_type import scalar_types
 def torch_moe(a, w1, w2, score, topk):
@@ -29,6 +37,20 @@ def torch_moe(a, w1, w2, score, topk):
            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
 def torch_moe_single(a, w, score, topk):
    """Reference single-matmul MoE computed with plain torch ops.

    Each row of ``a`` is duplicated ``topk`` times and paired with the
    experts chosen by ``torch.topk`` over the softmax of ``score``. Every
    copy is multiplied by its expert's transposed weight ``w[i]`` and the
    ``topk`` results per input row are summed without routing weights.
    """
    num_rows, hidden = a.shape
    out_features = w.shape[1]
    expanded = a.view(num_rows, -1, hidden).repeat(1, topk, 1)
    expanded = expanded.reshape(-1, hidden)
    result = torch.zeros(num_rows * topk,
                         out_features,
                         dtype=expanded.dtype,
                         device=expanded.device)
    probs = torch.softmax(score, dim=-1, dtype=torch.float32)
    expert_ids = torch.topk(probs, topk)[1].view(-1)
    for expert in range(w.shape[0]):
        chosen = expert_ids == expert
        if not chosen.any():
            # No row was routed to this expert.
            continue
        result[chosen] = expanded[chosen] @ w[expert].transpose(0, 1)
    return result.view(num_rows, -1, out_features).sum(dim=1)
@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 511, 1024])
@@ -43,11 +65,11 @@ def test_fused_moe(
    topk: int,
    dtype: torch.dtype,
 ):
-    a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-    w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
-    w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
-    score = torch.randn((m, e), device='cuda', dtype=dtype)
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
    triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
    torch_output = torch_moe(a, w1, w2, score, topk)
    torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
@@ -99,3 +121,194 @@ def test_mixtral_moe(dtype: torch.dtype):
                               vllm_states,
                               rtol=mixtral_moe_tol[dtype],
                               atol=mixtral_moe_tol[dtype])
 def stack_and_dev(tensors: List[torch.Tensor]):
    """Stack ``tensors`` along a new leading dimension.

    The stacked result is moved to the device of the first tensor.
    """
    target_device = tensors[0].device
    stacked = torch.stack(tensors, dim=0)
    return stacked.to(target_device)
 def compute_max_diff(output, output_ref):
    """Return mean(|output - output_ref|) divided by mean(|output_ref|).

    Despite the name, this is a ratio of means, not a maximum.
    """
    mean_abs_error = (output - output_ref).abs().mean()
    mean_abs_ref = output_ref.abs().mean()
    return mean_abs_error / mean_abs_ref
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 1024, 512])
@pytest.mark.parametrize("e", [4, 8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
@pytest.mark.parametrize("act_order", [True, False])
 def test_fused_marlin_moe(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    group_size: int,
    act_order: bool,
 ):
    """Check ``fused_marlin_moe`` against ``fused_moe``.

    Random fp16 weights for ``e`` experts are passed through
    ``marlin_quantize`` one expert at a time. ``fused_marlin_moe`` runs on
    the quantized tensors, ``fused_moe`` runs on the reference weights that
    ``marlin_quantize`` returns, and the two outputs must agree within the
    ``compute_max_diff`` threshold asserted at the end.
    """
    # Fixed seed so the random inputs are reproducible across runs.
    torch.manual_seed(7)
    # Nothing to test when more experts are requested than exist.
    if topk > e:
        return
    # Filter act_order
    if act_order:
        if group_size == -1:
            return
        if group_size in (k, n):
            return
    quant_type = scalar_types.uint4b8
    dtype = torch.float16
    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
    # NOTE(review): every iteration writes index 0, so only w2[0] is replaced
    # by the (k, n) eye matrix and the loop index is unused. Confirm whether
    # w2[i] was intended or whether the loop is leftover debugging code.
    for i in range(w2.shape[0]):
        w2[0] = torch.eye(k, n, device="cuda", dtype=dtype)
    # Per-expert results of quantizing w1, collected and stacked below.
    w_ref1_l = []
    qweight1_l = []
    scales1_l = []
    g_idx1_l = []
    sort_indices1_l = []
    for i in range(w1.shape[0]):
        test_perm = torch.randperm(k)
        # The sixth value returned by marlin_quantize is not used here.
        w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
            w1[i].transpose(1, 0), quant_type, group_size, act_order,
            test_perm)
        w_ref1_l.append(w_ref1)
        qweight1_l.append(qweight1)
        scales1_l.append(scales1)
        g_idx1_l.append(g_idx1)
        sort_indices1_l.append(sort_indices1)
    w_ref1 = stack_and_dev(w_ref1_l)
    qweight1 = stack_and_dev(qweight1_l).contiguous()
    scales1 = stack_and_dev(scales1_l)
    g_idx1 = stack_and_dev(g_idx1_l)
    sort_indices1 = stack_and_dev(sort_indices1_l)
    # Same per-expert quantization for w2; the permutation length is n here.
    w_ref2_l = []
    qweight2_l = []
    scales2_l = []
    g_idx2_l = []
    sort_indices2_l = []
    for i in range(w2.shape[0]):
        test_perm = torch.randperm(n)
        w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
            w2[i].transpose(1, 0), quant_type, group_size, act_order,
            test_perm)
        w_ref2_l.append(w_ref2)
        qweight2_l.append(qweight2)
        scales2_l.append(scales2)
        g_idx2_l.append(g_idx2)
        sort_indices2_l.append(sort_indices2)
    w_ref2 = stack_and_dev(w_ref2_l)
    qweight2 = stack_and_dev(qweight2_l).contiguous()
    scales2 = stack_and_dev(scales2_l)
    g_idx2 = stack_and_dev(g_idx2_l)
    sort_indices2 = stack_and_dev(sort_indices2_l)
    score = torch.randn((m, e), device="cuda", dtype=dtype)
    topk_weights, topk_ids = fused_topk(a, score, topk, False)
    # Baseline: fused_moe on the stacked reference weights, transposed back.
    triton_output = fused_moe(
        a,
        w_ref1.transpose(1, 2).contiguous(),
        w_ref2.transpose(1, 2).contiguous(),
        score,
        topk,
        renormalize=False,
    )
    # Path under test: fused_marlin_moe on the quantized tensors.
    marlin_output = fused_marlin_moe(
        a,
        qweight1,
        qweight2,
        score,
        g_idx1,
        g_idx2,
        sort_indices1,
        sort_indices2,
        topk_weights,
        topk_ids,
        w1_scale=scales1,
        w2_scale=scales2,
    )
    # compute_max_diff returns mean |diff| relative to mean |reference|.
    assert compute_max_diff(marlin_output, triton_output) < 4e-2
@pytest.mark.skip("This test is here for the sake of debugging, "
                  "don't run it in automated tests.")
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 1024, 512])
@pytest.mark.parametrize("e", [4, 8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
@pytest.mark.parametrize("act_order", [True, False])
 def test_marlin_moe_mmm(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    group_size: int,
    act_order: bool,
 ):
    """Check ``single_marlin_moe`` against the ``torch_moe_single`` reference.

    Skipped by default (see the ``skip`` marker). Random fp16 weights for
    ``e`` experts are passed through ``marlin_quantize`` one expert at a
    time; the kernel runs on the quantized tensors and the reference runs on
    the reference weights that ``marlin_quantize`` returns.
    """
    # Nothing to test when more experts are requested than exist.
    if topk > e:
        return
    # Filter act_order
    if act_order:
        if group_size == -1:
            return
        if group_size == k:
            return
    quant_type = scalar_types.uint4b8
    dtype = torch.float16
    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
    # Per-expert quantization results, collected and stacked below.
    w_ref_l = []
    qweights_l = []
    scales_l = []
    g_idx_l = []
    sort_indices_l = []
    for i in range(w.shape[0]):
        test_perm = torch.randperm(k)
        # The sixth value returned by marlin_quantize is not used here.
        w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
            w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm)
        w_ref_l.append(w_ref)
        qweights_l.append(qweight)
        scales_l.append(scales)
        g_idx_l.append(g_idx)
        sort_indices_l.append(sort_indices)
    w_ref = stack_and_dev(w_ref_l)
    qweight = stack_and_dev(qweights_l).contiguous()
    scales = stack_and_dev(scales_l)
    g_idx = stack_and_dev(g_idx_l)
    sort_indices = stack_and_dev(sort_indices_l)
    score = torch.randn((m, e), device="cuda", dtype=dtype)
    # Path under test: single_marlin_moe on the quantized tensors.
    marlin_output = single_marlin_moe(a,
                                      qweight,
                                      scales,
                                      score,
                                      g_idx,
                                      sort_indices,
                                      topk,
                                      renormalize=False)
    # Reference: plain-torch MoE on the stacked reference weights.
    torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
    # compute_max_diff returns mean |diff| relative to mean |reference|.
    assert compute_max_diff(marlin_output, torch_output) < 1e-2
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -3,7 +3,8 @@
 import itertools
 import random
 from numbers import Number
-from typing import Any, List, NamedTuple, Optional, Tuple, Union
+from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple,
                    Union)
 import pytest
 import torch
@@ -13,6 +14,21 @@ from vllm.attention.backends.xformers import XFormersBackend
 from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
                        make_tensor_with_pad)
 # Default opcheck suite. "test_aot_dispatch_dynamic" is left out for now
 # because there are some bugs related to this test in PyTorch 2.4.
 DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
    "test_schema",
    "test_autograd_registration",
    "test_faketensor",
 )
 # Full suite: the default checks followed by the one excluded above.
 ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
    *DEFAULT_OPCHECK_TEST_UTILS,
    "test_aot_dispatch_dynamic",
 )
 class QKVInputs(NamedTuple):
    '''
@@ -926,3 +942,19 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
    ideal_output = test_params.packed_qkvo.ideal_output
    torch.testing.assert_close(ideal_output,
                               output_under_test.view_as(ideal_output))
 def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
                      torch._library.custom_ops.CustomOpDef],
            args: Tuple[Any, ...],
            kwargs: Optional[Dict[str, Any]] = None,
            *,
            test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
            raise_exception: bool = True,
            cond: bool = True) -> Dict[str, str]:
    """Conditionally forward to ``torch.library.opcheck``.

    When ``cond`` is false nothing is checked and an empty dict is returned.
    Otherwise ``op``, ``args`` and ``kwargs`` are passed through together
    with ``test_utils`` and ``raise_exception``, and the result of
    ``torch.library.opcheck`` is returned unchanged.
    """
    if not cond:
        # Check disabled by the caller: report no results.
        return {}
    check_options = dict(test_utils=test_utils,
                         raise_exception=raise_exception)
    return torch.library.opcheck(op, args, kwargs, **check_options)
--- a/tests/models/fixtures/pixtral_chat.json
+++ b/tests/models/fixtures/pixtral_chat.json
--- a/tests/models/fixtures/pixtral_chat_engine.json
+++ b/tests/models/fixtures/pixtral_chat_engine.json
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -7,26 +7,6 @@ import pytest
 from tests.quantization.utils import is_quant_method_supported
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
 example_prompts = [
    'vLLM is a high-throughput and memory-efficient inference and serving '
    'engine for LLMs.\n',
    'Briefly describe the major milestones in the development of artificial '
    'intelligence from 1950 to 2020.\n',
    'Compare and contrast artificial intelligence with human intelligence in '
    'terms of processing information.\n',
    'Describe the basic components of a neural network and how it can be '
    'trained.\n',
    'Write a short story about a robot that dreams for the first time.\n',
    'Analyze the impact of the COVID-19 pandemic on global economic structures '
    'and future business models.\n',
    'Explain the cultural significance of the Mona Lisa painting, and how its '
    'perception might vary in Western versus Eastern societies.\n',
    "Translate the following English sentence into Japanese, French, and "
    "Swahili: 'The early bird catches the worm.'\n"
 ]
 # These ground truth generations were generated using `transformers==4.38.1
 # aqlm==1.1.0 torch==2.2.0`
 # and the below code:
--- a/tests/models/test_internvl.py
+++ b/tests/models/test_internvl.py
@@ -1,5 +1,5 @@
 import types
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union
 import pytest
 import torch
@@ -9,7 +9,8 @@ from transformers import AutoConfig
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                        _ImageAssets)
 from .utils import check_logprobs_close
 pytestmark = pytest.mark.vlm
@@ -20,6 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "cherry_blossom":
    "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
 })
 HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
 models = [
    "OpenGVLab/InternVL2-1B",
@@ -64,13 +66,13 @@ def generate(
 def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
+    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
@@ -83,12 +85,6 @@ def run_test(
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    images = [asset.pil_image for asset in image_assets]
    inputs_per_image = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
@@ -110,13 +106,21 @@ def run_test(
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size
-        def __call__(self, text: str, images: Image, **kwargs):
+        def __call__(self, text: str, images: Union[Image, List[Image]],
                     **kwargs):
            from vllm.model_executor.models.internvl import (
                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
-            pixel_values = image_to_pixel_values(
+            images = [images] if isinstance(images, Image) else images
-                images, self.image_size, self.min_num, self.max_num,
+            pixel_values = [
-                self.use_thumbnail).to(self.dtype)
+                image_to_pixel_values(image, self.image_size, self.min_num,
-            num_patches_list = [pixel_values.shape[0]]
+                                      self.max_num,
                                      self.use_thumbnail).to(self.dtype)
                for image in images
            ]
            num_patches_list = [
                pixel_value.shape[0] for pixel_value in pixel_values
            ]
            pixel_values = torch.cat(pixel_values, dim=0)
            for num_patches in num_patches_list:
                context_tokens = IMG_CONTEXT * self.num_image_token \
                    * num_patches
@@ -130,6 +134,7 @@ def run_test(
    with vllm_runner(model,
                     max_model_len=4096,
                     dtype=dtype,
                     limit_mm_per_prompt={"image": mm_limit},
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
@@ -138,7 +143,7 @@ def run_test(
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]
    with hf_runner(model, dtype=dtype) as hf_model:
@@ -156,7 +161,7 @@ def run_test(
                                                    num_logprobs=num_logprobs,
                                                    images=hf_images,
                                                    eos_token_id=eos_token_id)
-            for prompts, hf_images in inputs_per_image
+            for prompts, hf_images in inputs
        ]
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -264,19 +269,103 @@ if is_cpu():
@torch.inference_mode()
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    images = [asset.pil_image for asset in image_assets]
    inputs_per_image = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
    run_test(
        hf_runner,
        vllm_runner,
-        image_assets,
+        inputs_per_image,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.5, 0.75, 1.0],
    ],
 )
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
 def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_tokens: int,
                             num_logprobs: int) -> None:
    images = [asset.pil_image for asset in image_assets]
    inputs_per_case = [
        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
         [[rescale_image_size(image, factor) for image in images]
          for factor in size_factors])
    ]
    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
                               size_factors, dtype: str, max_tokens: int,
                               num_logprobs: int) -> None:
    """Run the comparison on 896x896 images rescaled by each size factor.

    Two input layouts are exercised in turn: one image per prompt (batched
    over the size factors) and all images together in one multi-image prompt.
    """
    square_images = [
        asset.pil_image.resize((896, 896)) for asset in image_assets
    ]

    # Layout 1: each image gets its own prompt, repeated once per scale.
    batched_inputs = []
    for img, prompt in zip(square_images, HF_IMAGE_PROMPTS):
        batched_inputs.append((
            [prompt] * len(size_factors),
            [rescale_image_size(img, factor) for factor in size_factors],
        ))

    # Layout 2: every image goes into the same prompt, once per scale.
    multi_image_inputs = [(
        [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors),
        [[rescale_image_size(img, factor) for img in square_images]
         for factor in size_factors],
    )]

    for case_inputs in (batched_inputs, multi_image_inputs):
        run_test(
            hf_runner,
            vllm_runner,
            case_inputs,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            mm_limit=2,
            tensor_parallel_size=1,
        )
@pytest.mark.parametrize(
    "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
@pytest.mark.parametrize(
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, overload
 import pytest
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
@@ -8,11 +8,14 @@ from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                        _ImageAssets)
 from .utils import check_logprobs_close
 pytestmark = pytest.mark.vlm
 _LIMIT_IMAGE_PER_PROMPT = 4
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
@@ -52,6 +55,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
    return hf_output_ids, hf_output_str, out_logprobs
@overload
 def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
@@ -64,6 +68,78 @@ def run_test(
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
    ...
@overload
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    sizes: List[Tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: Optional[List[float]] = None,
    sizes: Optional[List[Tuple[int, int]]] = None,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Build per-image inputs from the assets and hand them to `_run_test`.

    Each asset image is paired with its prompt from `HF_IMAGE_PROMPTS` and
    expanded into one variant per entry of `size_factors` (rescaling) or,
    if `size_factors` is None, per entry of `sizes` (absolute resize).
    `size_factors` takes precedence when both are given.

    Raises:
        ValueError: if neither `size_factors` nor `sizes` is provided.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    # Pick how a single image is expanded into its list of variants.
    if size_factors is not None:

        def make_variants(image):
            return [
                rescale_image_size(image, factor) for factor in size_factors
            ]
    elif sizes is not None:

        def make_variants(image):
            return [image.resize(size) for size in sizes]
    else:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    inputs_per_image = []
    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        variants = make_variants(image)
        # The prompt is repeated so that it lines up 1:1 with the variants.
        inputs_per_image.append(([prompt] * len(variants), variants))

    _run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )
 def _run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
    """Inference result should be the same between hf and vllm.
@@ -85,13 +161,6 @@ def run_test(
    else:
        mantis_processor = None
    images = [asset.pil_image for asset in image_assets]
    inputs_per_image = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
@@ -100,15 +169,18 @@ def run_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     dtype=dtype,
                     max_model_len=4096,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
+                     enforce_eager=True,
                     limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
                                          }) as vllm_model:
        vllm_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]
    if mantis_processor is not None:
@@ -131,7 +203,7 @@ def run_test(
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -181,6 +253,51 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
                                      model, dtype, max_tokens,
                                      num_logprobs) -> None:
    """Run `_run_test` on prompts that carry two, four and one image(s)."""
    stop_sign = image_assets[0].pil_image
    cherry_blossom = image_assets[1].pil_image

    two_image_prompt = "USER: <image><image>\nDescribe 2 images.\nASSISTANT:"
    prompts = [
        two_image_prompt,
        two_image_prompt,
        "USER: <image><image><image><image>\nDescribe 4 images.\nASSISTANT:",  # noqa: E501
        "USER: <image>\nWhat is the season?\nASSISTANT:",
    ]

    same_size_pair = [stop_sign, cherry_blossom]
    # Images with different sizes and aspect-ratios
    mixed_size_pair = [rescale_image_size(stop_sign, 0.1), stop_sign]
    mixed_size_quad = [
        stop_sign,
        rescale_image_size(stop_sign, 0.25),
        cherry_blossom.resize((183, 488)),
        cherry_blossom.resize((488, 183)),
    ]
    # The last prompt takes a bare image rather than a list of images.
    images = [same_size_pair, mixed_size_pair, mixed_size_quad, cherry_blossom]

    _run_test(
        hf_runner,
        vllm_runner,
        [(prompts, images)],
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
 def test_context_length_too_short(vllm_runner, image_assets, model):
    images = [asset.pil_image for asset in image_assets]
--- a/tests/models/test_llava_next_video.py
+++ b/tests/models/test_llava_next_video.py
@@ -0,0 +1,236 @@
 from typing import List, Optional, Tuple, Type, overload
 import pytest
 import transformers
 from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 from vllm.multimodal.utils import (rescale_video_size, resize_video,
                                   sample_frames_from_video)
 from vllm.sequence import SampleLogprobs
 from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
 from .utils import check_logprobs_close
 # Module-level pytest mark: tags the tests in this module with `vlm`.
 pytestmark = pytest.mark.vlm
 # Preamble text placed in front of the user turn of every video prompt.
 _PREFACE = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's "
    "questions.")
 # Prompts produced by VIDEO_ASSETS.prompts from a mapping keyed by name
 # ("sample_demo_1"); presumably the key is the video asset name - verify
 # against conftest.
 HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
    "sample_demo_1":
    f"{_PREFACE}USER: <video>\nWhy is this video funny? ASSISTANT:"
 })
 # Model identifiers used to parametrize the tests below.
 models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                          Optional[SampleLogprobs]],
                       model: str):
    """Sanitize vllm output to be comparable with hf output.

    Args:
        vllm_output: `(output_ids, output_str, out_logprobs)` as produced by
            the vLLM runner.
        model: model name used to load the config (for the video token id)
            and the tokenizer (for the EOS token).

    Returns:
        `(hf_output_ids, hf_output_str, out_logprobs)` where runs of
        consecutive video tokens are collapsed to one token, the leading
        space of the text is removed and the decoded EOS token is appended
        to the text when the last token id is EOS.
    """
    output_ids, output_str, out_logprobs = vllm_output
    config = AutoConfig.from_pretrained(model)
    video_token_id = config.video_token_index
    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id
    # Keep a video token only when it starts a run. The explicit `idx == 0`
    # check matters: `output_ids[idx - 1]` would otherwise wrap around to the
    # LAST token and could drop a leading video token by mistake.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if (token_id != video_token_id or idx == 0
            or output_ids[idx - 1] != video_token_id)
    ]
    assert output_str[0] == " "
    hf_output_str = output_str[1:]
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
    return hf_output_ids, hf_output_str, out_logprobs
@overload
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


@overload
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    sizes: List[Tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: Optional[List[float]] = None,
    sizes: Optional[List[Tuple[int, int]]] = None,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that vLLM and HF greedy generations agree on video inputs.

    `num_frames` frames are sampled from every video asset. Each sampled
    video is paired with its prompt from `HF_VIDEO_PROMPTS` and expanded into
    one variant per entry of `size_factors` (rescaling) or, if `size_factors`
    is None, per entry of `sizes` (absolute resize). `size_factors` takes
    precedence when both are given. The vLLM runner is executed first, then
    the HF runner, and the outputs are compared with `check_logprobs_close`.

    Raises:
        ValueError: if neither `size_factors` nor `sizes` is provided.
    """
    sampled_videos = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    if size_factors is not None:
        inputs_per_video = [(
            [prompt for _ in size_factors],
            [rescale_video_size(video, factor) for factor in size_factors],
        ) for video, prompt in zip(sampled_videos, HF_VIDEO_PROMPTS)]
    elif sizes is not None:
        inputs_per_video = [(
            [prompt for _ in sizes],
            [resize_video(video, size) for size in sizes],
        ) for video, prompt in zip(sampled_videos, HF_VIDEO_PROMPTS)]
    else:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     dtype=dtype,
                     max_model_len=4096,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs_per_video = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                videos=video_variants)
            for prompts, video_variants in inputs_per_video
        ]

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_video = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    videos=video_variants)
            for prompts, video_variants in inputs_per_video
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
                                        vllm_outputs_per_video):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                vllm_to_hf_output(vllm_output, model)
                for vllm_output in vllm_outputs
            ],
            name_0="hf",
            name_1="vllm",
        )
# Compare (major, minor) numerically: a plain string comparison against
# "4.45" is lexicographic and misorders versions such as "4.100" or "4.5".
@pytest.mark.skipif(
    tuple(int(part)
          for part in transformers.__version__.split(".")[:2]) < (4, 45),
    reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No video
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
                dtype, max_tokens, num_logprobs, num_frames) -> None:
    """Inference result should be the same between hf and vllm.

    All the fixtures for this test are under tests/videos.
    For huggingface runner, we provide the np.ndarray as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    run_test(
        hf_runner,
        vllm_runner,
        video_assets,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        num_frames=num_frames,
        tensor_parallel_size=1,
    )
# Compare (major, minor) numerically: a plain string comparison against
# "4.45" is lexicographic and misorders versions such as "4.100" or "4.5".
@pytest.mark.skipif(
    tuple(int(part)
          for part in transformers.__version__.split(".")[:2]) < (4, 45),
    reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "sizes",
    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
                            dtype, max_tokens, num_logprobs,
                            num_frames) -> None:
    """Run the hf-vs-vllm comparison with videos resized to fixed sizes."""
    run_test(
        hf_runner,
        vllm_runner,
        video_assets,
        model,
        sizes=sizes,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        num_frames=num_frames,
        tensor_parallel_size=1,
    )
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -41,3 +41,43 @@ def test_models(
        name_0="hf",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS[1:])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Load the same model in HF and Mistral formats and compare logprobs.

    The model is run twice through the vLLM runner, once with the HF-style
    tokenizer/weights/config settings and once with the Mistral-style ones;
    the greedy outputs of both runs must be close.
    """
    hf_format_kwargs = {
        "tokenizer_mode": "auto",
        "load_format": "safetensors",
        "config_format": "hf",
    }
    mistral_format_kwargs = {
        "tokenizer_mode": "mistral",
        "load_format": "mistral",
        "config_format": "mistral",
    }

    with vllm_runner(model, dtype=dtype,
                     **hf_format_kwargs) as hf_format_model:
        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model, dtype=dtype,
                     **mistral_format_kwargs) as mistral_format_model:
        mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_format_outputs,
        outputs_1_lst=mistral_format_outputs,
        name_0="hf",
        name_1="mistral",
    )
--- a/tests/models/test_modelopt.py
+++ b/tests/models/test_modelopt.py
@@ -0,0 +1,79 @@
 # flake8: noqa
 """Tests Model Optimizer fp8 models against ground truth generation
 Note: these tests will only pass on H100
 """
 import os
 from typing import List
 import pytest
 from transformers import AutoTokenizer
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
 # Set TOKENIZERS_PARALLELISM for this process when the module is imported.
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 # Passed as `max_model_len` when the LLM under test is constructed.
 MAX_MODEL_LEN = 1024
 # Model identifiers used to parametrize the test below.
 MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
 # Golden generations per model. The test indexes each list by position and
 # compares entry i with the generation for example prompt i, so the order
 # of the strings must match the order of `example_prompts`.
 EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.1-8B-Instruct-FP8": [
        "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
        'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
        '**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
        'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
        'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
    ]
 }
 # This test compares against golden strings for exact match since
 # there is no baseline implementation to compare against
 # and is unstable w.r.t specifics of the fp8 implementation or
 # the hardware being run on.
 # Disabled to prevent it from breaking the build
@pytest.mark.skip(
    reason=
    "Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Generate with a ModelOpt-quantized model and compare to golden text.

    Every example prompt is wrapped in the tokenizer's chat template,
    generated greedily (20 tokens, temperature 0) and compared for exact
    equality with the matching entry of `EXPECTED_STRS_MAP[model_name]`.
    """
    llm = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
        quantization="modelopt",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    formatted_prompts = []
    for prompt in example_prompts:
        chat = [{"role": "user", "content": prompt}]
        formatted_prompts.append(
            tokenizer.apply_chat_template(chat,
                                          tokenize=False,
                                          add_generation_prompt=True))

    params = SamplingParams(max_tokens=20, temperature=0)

    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    generations: List[str] = []
    for formatted in formatted_prompts:
        request_outputs = llm.generate(formatted, params)
        generations.append(request_outputs[0].outputs[0].text)
    del llm

    print(model_name, generations)

    expected_strs = EXPECTED_STRS_MAP[model_name]
    for i in range(len(example_prompts)):
        generated_str, expected_str = generations[i], expected_strs[i]
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,16 +1,15 @@
 import os
 import re
-from typing import List, Optional, Tuple, Type, Union
+from typing import List, Optional, Tuple, Type
 import pytest
 from PIL import Image
 from transformers import AutoTokenizer
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu, is_hip
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
+from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from .utils import check_logprobs_close
 pytestmark = pytest.mark.vlm
@@ -60,8 +59,7 @@ if is_hip():
 def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
-    inputs: List[Tuple[List[str], Union[List[Image.Image],
+    inputs: List[Tuple[List[str], PromptImageInput]],
-                                        List[List[Image.Image]]]]],
    model: str,
    *,
    dtype: str,
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Simon Mo	acda0b35d0	bump version to v0.6.1.post1 (#8440 ) Some checks failed Create Release / Create Release (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.10, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.11, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.12, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.8, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.9, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.10, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.11, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.12, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.8, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.9, 2.4.0) (push) Has been cancelled Details	2024-09-12 21:39:49 -07:00
William Lin	ba77527955	[bugfix] torch profiler bug for single gpu with GPUExecutor (#8354 )	2024-09-12 21:30:00 -07:00
Alexander Matveev	6821020109	[Bugfix] Fix async log stats (#8417 )	2024-09-12 20:48:59 -07:00
Cyrus Leung	8427550488	[CI/Build] Update pixtral tests to use JSON (#8436 )	2024-09-13 03:47:52 +00:00
Cyrus Leung	3f79bc3d1a	[Bugfix] Bump fastapi and pydantic version (#8435 )	2024-09-13 03:21:42 +00:00
shangmingc	40c396533d	[Bugfix] Mapping physical device indices for e2e test utils (#8290 )	2024-09-13 11:06:28 +08:00
Cyrus Leung	5ec9c0fb3c	[Core] Factor out input preprocessing to a separate class (#7329 )	2024-09-13 02:56:13 +00:00
Dipika Sikka	8f44a92d85	[BugFix] fix group_topk (#8430 )	2024-09-13 09:23:42 +08:00
Roger Wang	360ddbd37e	[Misc] Update Pixtral example (#8431 )	2024-09-12 17:31:18 -07:00
Wenxiang	a480939e8e	[Bugfix] Fix weight loading issue by rename variable. (#8293 )	2024-09-12 19:25:00 -04:00
Patrick von Platen	d31174a4e1	[Hotfix][Pixtral] Fix multiple images bugs (#8415 )	2024-09-12 15:21:51 -07:00
Roger Wang	b61bd98f90	[CI/Build] Disable multi-node test for InternVL2 (#8428 )	2024-09-12 15:05:35 -07:00
Roger Wang	c16369455f	[Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (#8425 )	2024-09-12 14:06:51 -07:00
Alexander Matveev	019877253b	[Bugfix] multi-step + flashinfer: ensure cuda graph compatible (#8427 )	2024-09-12 21:01:50 +00:00
Nick Hill	551ce01078	[Core] Add engine option to return only deltas or final output (#7381 )	2024-09-12 12:02:00 -07:00
William Lin	a6c0f3658d	[multi-step] add flashinfer backend (#7928 )	2024-09-12 11:16:22 -07:00
Joe Runde	f2e263b801	[Bugfix] Offline mode fix (#8376 ) Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>	2024-09-12 11:11:57 -07:00
Luis Vega	1f0c75afa9	[BugFix] Fix Duplicate Assignment in Hermes2ProToolParser (#8423 )	2024-09-12 11:10:11 -07:00
WANGWEI	8a23e93302	[BugFix] lazy init _copy_stream to avoid torch init wrong gpu instance (#8403 )	2024-09-12 10:47:42 -07:00
Alex Brooks	c6202daeed	[Model] Support multiple images for qwen-vl (#8247 ) Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>	2024-09-12 10:10:54 -07:00
Isotr0py	e56bf27741	[Bugfix] Fix InternVL2 inference with various num_patches (#8375 ) Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>	2024-09-12 10:10:35 -07:00
Roger Wang	520ca380ae	[Hotfix][VLM] Fixing max position embeddings for Pixtral (#8399 )	2024-09-12 09:28:37 -07:00
youkaichao	7de49aa86c	[torch.compile] hide slicing under custom op for inductor (#8384 )	2024-09-12 00:11:55 -07:00
Woosuk Kwon	42ffba11ad	[Misc] Use RoPE cache for MRoPE (#8396 )	2024-09-11 23:13:14 -07:00
Kevin Lin	295c4730a8	[Misc] Raise error when using encoder/decoder model with cpu backend (#8355 )	2024-09-12 05:45:24 +00:00
Blueyo0	1bf2dd9df0	[Gemma2] add bitsandbytes support for Gemma2 (#8338 )	2024-09-11 21:53:12 -07:00
tomeras91	5a60699c45	[Bugfix]: Fix the logic for deciding if tool parsing is used (#8366 )	2024-09-12 03:55:30 +00:00
Michael Goin	b6c75e1cf2	Fix the AMD weight loading tests (#8390 )	2024-09-11 20:35:33 -07:00
Woosuk Kwon	b71c956deb	[TPU] Use Ray for default distributed backend (#8389 )	2024-09-11 20:31:51 -07:00
youkaichao	f842a7aff1	[misc] remove engine_use_ray (#8126 )	2024-09-11 18:23:36 -07:00
Cody Yu	a65cb16067	[MISC] Dump model runner inputs when crashing (#8305 )	2024-09-12 01:12:25 +00:00
Simon Mo	3fd2b0d21c	Bump version to v0.6.1 (#8379 ) Some checks failed Create Release / Create Release (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.10, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.11, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.12, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.8, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (11.8, ubuntu-20.04, 3.9, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.10, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.11, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.12, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.8, 2.4.0) (push) Has been cancelled Details Create Release / Build Wheel (12.1, ubuntu-20.04, 3.9, 2.4.0) (push) Has been cancelled Details	2024-09-11 14:42:11 -07:00
Patrick von Platen	d394787e52	Pixtral (#8377 ) Co-authored-by: Roger Wang <ywang@roblox.com>	2024-09-11 14:41:55 -07:00
Lily Liu	775f00f81e	[Speculative Decoding] Test refactor (#8317 ) Co-authored-by: youkaichao <youkaichao@126.com>	2024-09-11 14:07:34 -07:00
Aarni Koskela	8baa454937	[Misc] Move device options to a single place (#8322 )	2024-09-11 13:25:58 -07:00
bnellnm	73202dbe77	[Kernel][Misc] register ops to prevent graph breaks (#6917 ) Co-authored-by: Sage Moore <sage@neuralmagic.com>	2024-09-11 12:52:19 -07:00
Cyrus Leung	7015417fd4	[Bugfix] Add missing attributes in mistral tokenizer (#8364 )	2024-09-11 11:36:54 -07:00
Alexey Kondratiev(AMD)	aea02f30de	[CI/Build] Excluding test_moe.py from AMD Kernels tests for investigation (#8373 )	2024-09-11 18:31:41 +00:00
Li, Jiang	0b952af458	[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257 )	2024-09-11 09:46:46 -07:00
Yang Fan	3b7fea770f	[Model][VLM] Add Qwen2-VL model support (#7905 ) Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>	2024-09-11 09:31:19 -07:00
Pooya Davoodi	cea95dfb94	[Frontend] Create ErrorResponse instead of raising exceptions in run_batch (#8347 )	2024-09-11 05:30:11 +00:00
Yangshen⚡Deng	6a512a00df	[model] Support for Llava-Next-Video model (#7559 ) Co-authored-by: Roger Wang <ywang@roblox.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>	2024-09-10 22:21:36 -07:00
Pavani Majety	efcf946a15	[Hardware][NV] Add support for ModelOpt static scaling checkpoints. (#6112 )	2024-09-11 00:38:40 -04:00
Isotr0py	1230263e16	[Bugfix] Fix InternVL2 vision embeddings process with pipeline parallel (#8299 )	2024-09-11 10:11:01 +08:00
Jee Jee Li	e497b8aeff	[Misc] Skip loading extra bias for Qwen2-MOE GPTQ models (#8329 )	2024-09-10 20:59:19 -04:00
Tyler Michael Smith	94144e726c	[CI/Build][Kernel] Update CUTLASS to 3.5.1 tag (#8043 )	2024-09-10 23:51:58 +00:00
William Lin	1d5e397aa4	[Core/Bugfix] pass VLLM_ATTENTION_BACKEND to ray workers (#8172 )	2024-09-10 23:46:08 +00:00
Alexander Matveev	22f3a4bc6c	[Bugfix] lookahead block table with cuda graph max capture (#8340 ) [Bugfix] Ensure multistep lookahead allocation is compatible with cuda graph max capture (#8340)	2024-09-10 16:00:35 -07:00
Cody Yu	b1f3e18958	[MISC] Keep chunked prefill enabled by default with long context when prefix caching is enabled (#8342 )	2024-09-10 22:28:28 +00:00
Prashant Gupta	04e7c4e771	[Misc] remove peft as dependency for prompt models (#8162 )	2024-09-10 17:21:56 -04:00
Kevin Lin	5faedf1b62	[Spec Decode] Move ops.advance_step to flash attn advance_step (#8224 )	2024-09-10 13:18:14 -07:00
sumitd2	02751a7a42	Fix ppc64le buildkite job (#8309 )	2024-09-10 12:58:34 -07:00
Alexey Kondratiev(AMD)	f421f3cefb	[CI/Build] Enabling kernels tests for AMD, ignoring some of then that fail (#8130 )	2024-09-10 11:51:15 -07:00
Cyrus Leung	8c054b7a62	[Frontend] Clean up type annotations for mistral tokenizer (#8314 )	2024-09-10 16:49:11 +00:00
Daniele	6234385f4a	[CI/Build] enable ccache/scccache for HIP builds (#8327 )	2024-09-10 08:55:08 -07:00
Cyrus Leung	da1a844e61	[Bugfix] Fix missing `post_layernorm` in CLIP (#8155 )	2024-09-10 08:22:50 +00:00
Simon Mo	a1d874224d	Add NVIDIA Meetup slides, announce AMD meetup, and add contact info (#8319 )	2024-09-09 23:21:00 -07:00
Dipika Sikka	6cd5e5b07e	[Misc] Fused MoE Marlin support for GPTQ (#8217 )	2024-09-09 23:02:52 -04:00
Kyle Sayers	c7cb5c3335	[Misc] GPTQ Activation Ordering (#8135 )	2024-09-09 16:27:26 -04:00
Vladislav Kruglikov	f9b4a2d415	[Bugfix] Correct adapter usage for cohere and jamba (#8292 )	2024-09-09 11:20:46 -07:00
Adam Lugowski	58fcc8545a	[Frontend] Add progress reporting to run_batch.py (#8060 ) Co-authored-by: Adam Lugowski <adam.lugowski@parasail.io>	2024-09-09 11:16:37 -07:00
Kyle Mistele	08287ef675	[Bugfix] Streamed tool calls now more strictly follow OpenAI's format; ensures Vercel AI SDK compatibility (#8272 )	2024-09-09 10:45:11 -04:00
Alexander Matveev	4ef41b8476	[Bugfix] Fix async postprocessor in case of preemption (#8267 )	2024-09-07 21:01:51 -07:00
Joe Runde	cfe712bf1a	[CI/Build] Use python 3.12 in cuda image (#8133 ) Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>	2024-09-07 13:03:16 -07:00
sumitd2	b962ee1470	ppc64le: Dockerfile fixed, and a script for buildkite (#8026 )	2024-09-07 11:18:40 -07:00
Isotr0py	36bf8150cc	[Model][VLM] Decouple weight loading logic for `Paligemma` (#8269 )	2024-09-07 17:45:44 +00:00
Isotr0py	e807125936	[Model][VLM] Support multi-images inputs for InternVL2 models (#8201 )	2024-09-07 16:38:23 +08:00
Cyrus Leung	9f68e00d27	[Bugfix] Fix broken OpenAI tensorizer test (#8258 )	2024-09-07 08:02:39 +00:00
youkaichao	ce2702a923	[tpu][misc] fix typo (#8260 )	2024-09-06 22:40:46 -07:00
Wei-Sheng Chin	795b662cff	Enable Random Prefix Caching in Serving Profiling Tool (benchmark_serving.py) (#8241 )	2024-09-06 20:18:16 -07:00
Cyrus Leung	2f707fcb35	[Model] Multi-input support for LLaVA (#8238 )	2024-09-07 02:57:24 +00:00
Kyle Mistele	41e95c5247	[Bugfix] Fix Hermes tool call chat template bug (#8256 ) Co-authored-by: Kyle Mistele <kyle@constellate.ai>	2024-09-07 10:49:01 +08:00
William Lin	12dd715807	[misc] [doc] [frontend] LLM torch profiler support (#7943 )	2024-09-06 17:48:48 -07:00
Patrick von Platen	29f49cd6e3	[Model] Allow loading from original Mistral format (#8168 ) Co-authored-by: Michael Goin <michael@neuralmagic.com>	2024-09-06 17:02:05 -06:00
Dipika Sikka	23f322297f	[Misc] Remove `SqueezeLLM` (#8220 )	2024-09-06 16:29:03 -06:00
rasmith	9db52eab3d	[Kernel] [Triton] Memory optimization for awq_gemm and awq_dequantize, 2x throughput (#8248 )	2024-09-06 16:26:09 -06:00
Alexey Kondratiev(AMD)	1447c97e75	[CI/Build] Increasing timeout for multiproc worker tests (#8203 )	2024-09-06 11:51:03 -07:00
Rui Qiao	de80783b69	[Misc] Use ray[adag] dependency instead of cuda (#7938 )	2024-09-06 09:18:35 -07:00
afeldman-nm	e5cab71531	[Frontend] Add --logprobs argument to `benchmark_serving.py` (#8191 )	2024-09-06 09:01:14 -07:00
Nick Hill	baa5467547	[BugFix] Fix Granite model configuration (#8216 )	2024-09-06 11:39:29 +08:00
Jiaxin Shan	db3bf7c991	[Core] Support load and unload LoRA in api server (#6566 ) Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>	2024-09-05 18:10:33 -07:00
sroy745	2febcf2777	[Documentation][Spec Decode] Add documentation about lossless guarantees in Speculative Decoding in vLLM (#7962 )	2024-09-05 16:25:29 -04:00
Michael Goin	2ee45281a5	Move verify_marlin_supported to GPTQMarlinLinearMethod (#8165 )	2024-09-05 11:09:46 -04:00
Alex Brooks	9da25a88aa	[MODEL] Qwen Multimodal Support (Qwen-VL / Qwen-VL-Chat) (#8029 ) Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>	2024-09-05 12:48:10 +00:00
manikandan.tm@zucisystems.com	8685ba1a1e	Inclusion of InternVLChatModel In PP_SUPPORTED_MODELS(Pipeline Parallelism) (#7860 )	2024-09-05 11:33:37 +00:00
Cyrus Leung	288a938872	[Doc] Indicate more information about supported modalities (#8181 )	2024-09-05 10:51:53 +00:00
Elfie Guo	e39ebf5cf5	[Core/Bugfix] Add query dtype as per FlashInfer API requirements. (#8173 )	2024-09-05 05:12:26 +00:00
Kevin H. Luu	ba262c4e5a	[ci] Mark LoRA test as soft-fail (#8160 ) Signed-off-by: kevin <kevin@anyscale.com>	2024-09-04 20:33:12 -07:00
Woosuk Kwon	4624d98dbd	[Misc] Clean up RoPE forward_native (#8076 )	2024-09-04 20:31:48 -07:00
William Lin	1afc931987	[bugfix] >1.43 constraint for openai (#8169 ) Co-authored-by: Michael Goin <michael@neuralmagic.com>	2024-09-04 17:35:36 -07:00
Maureen McElaney	e01c2beb7d	[Doc] [Misc] Create CODE_OF_CONDUCT.md (#8161 )	2024-09-04 16:50:13 -07:00