[Core][V0] Enable regex support with xgrammar (#13228 )

Signed-off-by: Russell Bryant <rbryant@redhat.com>
[Benchmark][Bugfix] Fix SonnetDataset default values in benchmark_throughput.py (#16556 )
2025-04-14 10:13:38 +08:00 · 2025-04-13 17:20:26 -07:00 · 2025-04-13 14:55:18 +00:00 · 2025-04-12 19:42:51 -07:00 · 2025-04-12 20:19:19 +00:00 · 2025-04-12 22:54:37 +08:00
1421 changed files with 121775 additions and 34389 deletions
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.233
+    value: 0.231
  - name: "exact_match,flexible-extract"
-    value: 0.236
+    value: 0.22
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@@ -0,0 +1,11 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
 model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.31
  - name: "exact_match,flexible-extract"
    value: 0.47
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base-FP8.yaml
+Qwen1.5-MoE-W4A16-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -13,6 +13,7 @@ from pathlib import Path
 import lm_eval
 import numpy
 import pytest
 import yaml
 RTOL = 0.05
@@ -46,6 +47,10 @@ def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
    if eval_config[
            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
        pytest.skip("FBGEMM is currently failing on main.")
    # Launch eval requests.
    results = launch_lm_eval(eval_config)
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -84,8 +84,13 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_serving.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
+            try:
-                command = json.loads(f.read())
+                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue
            raw_result.update(command)
            # update the test name of this result
@@ -99,8 +104,13 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_latency.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
+            try:
-                command = json.loads(f.read())
+                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue
            raw_result.update(command)
            # update the test name of this result
@@ -121,8 +131,13 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_throughput.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
+            try:
-                command = json.loads(f.read())
+                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue
            raw_result.update(command)
            # update the test name of this result
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -426,7 +426,7 @@ main() {
  pip install -U transformers
-  pip install -r requirements-dev.txt
+  pip install -r requirements/dev.txt
  which genai-perf
  # check storage
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -10,15 +10,24 @@ set -x
 set -o pipefail
 check_gpus() {
-  # check the number of GPUs and GPU type.
+  if command -v nvidia-smi; then
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  fi
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
+  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  fi
  echo "GPU type is $gpu_type"
 }
@@ -90,9 +99,15 @@ kill_gpu_processes() {
  # wait until GPU memory usage smaller than 1GB
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+  if command -v nvidia-smi; then
-    sleep 1
+    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-  done
+      sleep 1
    done
  elif command -v amd-smi; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  fi
  # remove vllm config file
  rm -rf ~/.config/vllm
@@ -309,11 +324,14 @@ run_serving_tests() {
      new_test_name=$test_name"_qps_"$qps
      # pass the tensor parallel size to the client so that it can be displayed
      # on the benchmark dashboard
      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --metadata "tensor_parallel_size=$tp" \
        $client_args"
      echo "Running test case $test_name with qps $qps"
@@ -358,7 +376,7 @@ main() {
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOG_LEVEL="WARNING"
+  export VLLM_LOGGING_LEVEL="WARNING"
  # prepare for benchmarking
  cd benchmarks || exit 1
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -64,9 +64,11 @@
            "disable_log_requests": "", 
            "tensor_parallel_size": 4,
            "swap_space": 16,
-            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "speculative_config": {
-            "num_speculative_tokens": 4,
+                "model": "turboderp/Qwama-0.5B-Instruct",
-            "speculative_draft_tensor_parallel_size": 1
+                "num_speculative_tokens": 4,
                "draft_tensor_parallel_size": 1
            }
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,12 +1,23 @@
 steps:
  - label: "Build wheel - CUDA 12.4"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  - label: "Build wheel - CUDA 12.1"
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
@@ -20,10 +31,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
@@ -37,7 +48,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  - label: "Build and publish TPU release image"
@@ -46,7 +57,7 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
@@ -71,7 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -1,16 +0,0 @@
 #!/bin/bash
 # This script build the OpenVINO docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Try building the docker image
 docker build -t openvino-test -f Dockerfile.openvino .
 # Setup cleanup
 remove_docker_container() { docker rm -f openvino-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and launch offline inference
 docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -1,26 +0,0 @@
 #!/bin/bash
 set -e
 # Build the docker image.
 docker build -f Dockerfile.tpu -t vllm-tpu .
 # Set up cleanup.
 remove_docker_container() { docker rm -f tpu-test || true; }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -1,19 +0,0 @@
 #!/bin/bash
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Try building the docker image
 docker build -t xpu-test -f Dockerfile.xpu .
 # Setup cleanup
 remove_docker_container() { docker rm -f xpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -77,7 +77,6 @@ echo "Commands:$commands"
 #ignore certain kernels tests
 if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
  --ignore=kernels/test_attention.py \
  --ignore=kernels/test_attention_selector.py \
  --ignore=kernels/test_blocksparse_attention.py \
  --ignore=kernels/test_causal_conv1d.py \
@@ -92,19 +91,54 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_moe.py \
  --ignore=kernels/test_prefix_prefill.py \
  --ignore=kernels/test_rand.py \
-  --ignore=kernels/test_sampler.py"
+  --ignore=kernels/test_sampler.py \
  --ignore=kernels/test_cascade_flash_attn.py \
  --ignore=kernels/test_mamba_mixer2.py \
  --ignore=kernels/test_aqlm.py \
  --ignore=kernels/test_machete_mm.py \
  --ignore=kernels/test_mha_attn.py \
  --ignore=kernels/test_block_fp8.py \
  --ignore=kernels/test_permute_cols.py"
 fi
-#ignore certain Entrypoints tests
+#ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_accuracy.py \
  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
-  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_oot_registration.py "}
+  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
  --ignore=entrypoints/openai/test_root_path.py \
  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi
 #ignore certain Entrypoints/llm tests
 if [[ $commands == *" entrypoints/llm "* ]]; then
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
 #Obsolete currently
 ##ignore certain Entrypoints/llm tests
 #if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
 #  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 #fi
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
 # --ignore=entrypoints/openai/test_accuracy.py \
 # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
@@ -114,9 +148,10 @@ if [[ $commands == *"--shard-id="* ]]; then
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
-        --device /dev/kfd --device /dev/dri \
+        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-        --network host \
+        --network=host \
        --shm-size=16gb \
        --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
@@ -143,9 +178,10 @@ if [[ $commands == *"--shard-id="* ]]; then
    fi
  done
 else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
-          --device /dev/kfd --device /dev/dri \
+          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-          --network host \
+          --network=host \
          --shm-size=16gb \
          --rm \
          -e HIP_VISIBLE_DEVICES=0 \
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -10,5 +10,5 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
-docker build -t cpu-test -f Dockerfile.ppc64le .
+docker build -t cpu-test -f docker/Dockerfile.ppc64le .
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -8,24 +8,29 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 # Try building the docker image
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 # Setup cleanup
-remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { 
    set -e; 
    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
  export BUILDKITE_BUILD_NUMBER=$3
  # offline inference
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
@@ -35,7 +40,8 @@ function cpu_tests() {
  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
-    pip install -r vllm/requirements-test.txt
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -85,4 +91,4 @@ function cpu_tests() {
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -9,11 +9,13 @@ python3 use_existing_torch.py
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg vllm_fa_cmake_gpu_arches="90-real"
@@ -23,6 +25,6 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and test offline inference
-docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -5,7 +5,7 @@
 set -ex
 # Try building the docker image
-docker build -t hpu-test-env -f Dockerfile.hpu .
+docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@@ -35,7 +35,7 @@ else
    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
-docker build -t "${image_name}" -f Dockerfile.neuron .
+docker build -t "${image_name}" -f docker/Dockerfile.neuron .
 # Setup cleanup
 remove_docker_container() {
@@ -44,11 +44,11 @@ remove_docker_container() {
 trap remove_docker_container EXIT
 # Run the image
-docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+docker run --rm -it --device=/dev/neuron0 --network bridge \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -0,0 +1,47 @@
 #!/bin/bash
 set -xue
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 # Set up cleanup.
 remove_docker_container() { docker rm -f tpu-test || true; }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo TEST_0 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
    && echo TEST_1 \
    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
    && echo TEST_2 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
    && echo TEST_3 \
    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
    && echo TEST_4 \
    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
    && echo TEST_5 \
    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
    && echo TEST_6 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
    && echo TEST_7 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
    && echo TEST_8 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
    && echo TEST_9 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -0,0 +1,31 @@
 #!/bin/bash
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 # Try building the docker image
 docker build -t ${image_name} -f docker/Dockerfile.xpu .
 # Setup cleanup
 remove_docker_container() { 
  docker rm -f "${container_name}" || true; 
  docker image rm -f "${image_name}" || true;
  docker system prune -f || true;
 }
 trap remove_docker_container EXIT
 # Run the image and test offline inference/tensor parallel
 docker run \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
    sh -c '
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -5,8 +5,8 @@
 set -ex
 set -o pipefail
-# cd into parent directory of this file
+# cd 2 levels into the working directory
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@@ -3,7 +3,7 @@
 set -euox pipefail
 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
 fi
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -50,8 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
 elif [[ $normal_wheel == *"cu121"* ]]; then
    # if $normal_wheel matches cu121, do not upload the index.html
    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu12 wheels (default wheels)
+    # only upload index.html for cu124 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@@ -63,8 +66,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
 elif [[ $normal_wheel == *"cu121"* ]]; then
    # if $normal_wheel matches cu121, do not upload the index.html
    echo "Skipping index files for cu121 wheels"
 else
-    # only upload index.html for cu12 wheels (default wheels)
+    # only upload index.html for cu124 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -35,13 +35,12 @@ steps:
  fast_check: true
  no_gpu: True
  commands:
-  - pip install -r requirements-docs.txt
+  - pip install -r ../../requirements/docs.txt
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/api/inference_params.html
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
@@ -78,6 +77,7 @@ steps:
  - tests/basic_correctness/test_preemption
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
@@ -104,7 +104,7 @@ steps:
 - label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
@@ -112,19 +112,19 @@ steps:
  - tests/entrypoints/test_chat_utils
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py  --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 - label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  fast_check: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
@@ -134,37 +134,42 @@ steps:
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
  commands:
  # test with tp=2 and external_dp=2
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
-  - python3 ../examples/offline_inference/rlhf.py
+  - pushd ../examples/offline_inference
-  - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py
+  - python3 rlhf.py
  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
 - label: Metrics, Tracing Test # 10min
  mirror_hardwares: [amd]
  num_gpus: 2
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
  commands:
  - pytest -v -s metrics
  - "pip install \
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
      'opentelemetry-api>=1.26.0,<1.27.0' \
      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
  - pytest -v -s tracing
 ##### fast check tests  #####
 #####  1 GPU test  #####
 - label: Regression Test # 5min
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@@ -194,15 +199,18 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
-    - VLLM_USE_V1=1 pytest -v -s v1/core
+    - pytest -v -s v1/core
-    - VLLM_USE_V1=1 pytest -v -s v1/engine
+    - pytest -v -s v1/engine
-    - VLLM_USE_V1=1 pytest -v -s v1/sample
+    - pytest -v -s v1/entrypoints
-    - VLLM_USE_V1=1 pytest -v -s v1/worker
+    - pytest -v -s v1/sample
-    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+    - pytest -v -s v1/worker
-    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+    - pytest -v -s v1/structured_output
    - pytest -v -s v1/test_stats.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - VLLM_USE_V1=1 pytest -v -s v1/e2e
+    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -220,14 +228,17 @@ steps:
    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/vision_language.py
+    - python3 offline_inference/audio_language.py --seed 0
-    - python3 offline_inference/vision_language_multi_image.py
+    - python3 offline_inference/vision_language.py --seed 0
-    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/vision_language_embedding.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 - label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]
@@ -269,15 +280,22 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" # 9min
+- label: PyTorch Compilation Unit Tests
-  fast_check: true
+  source_file_dependencies:
    - vllm/
    - tests/compile
  commands:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
 - label: PyTorch Fullgraph Smoke Test # 9min
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -287,7 +305,7 @@ steps:
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
- label: "PyTorch Fullgraph Test" # 18min
+- label: PyTorch Fullgraph Test # 18min
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -295,7 +313,7 @@ steps:
  - pytest -v -s compile/test_full_graph.py
 - label: Kernels Test %N # 1h each
-  mirror_hardwares: [amd]
+  # mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
  - vllm/attention
@@ -305,7 +323,7 @@ steps:
  parallelism: 4
 - label: Tensorizer Test # 11min
-  mirror_hardwares: [amd]
+  # mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
@@ -321,7 +339,7 @@ steps:
  source_file_dependencies:
  - benchmarks/
  commands:
-  - bash run-benchmarks.sh
+  - bash scripts/run-benchmarks.sh
 - label: Quantization Test # 33min
  source_file_dependencies:
@@ -356,12 +374,14 @@ steps:
 - label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
-  mirror_hardwares: [ amd ]
+  #mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
    - tests/mistral_tool_use
  commands:
    - pytest -v -s tool_use
    - pytest -v -s mistral_tool_use
 #####  models test  #####
@@ -372,7 +392,9 @@ steps:
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
-    - pytest -v -s models/test_initialization.py
+    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
 - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
@@ -409,11 +431,12 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal
    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+    - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/vision_language -m core_model
    - pytest -v -s models/encoder_decoder/audio_language -m core_model
    - pytest -v -s models/encoder_decoder/language -m core_model
    - pytest -v -s models/encoder_decoder/vision_language -m core_model
    - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
 - label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
@@ -427,10 +450,7 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-    # HACK - run phi3v tests separately to sidestep this transformers bug
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
    # https://github.com/huggingface/transformers/issues/34307
    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/vision_language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
@@ -446,6 +466,7 @@ steps:
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
  mirror_hardwares: [amd]
  optional: true
  commands:
    - echo 'Testing custom models...'
@@ -457,6 +478,7 @@ steps:
 #####  multi gpus test  #####
 - label: Distributed Comm Ops Test # 7min
  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@@ -499,9 +521,11 @@ steps:
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - vllm/v1/engine/
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -514,13 +538,12 @@ steps:
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
 - label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  fast_check: true
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
@@ -579,13 +602,10 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # This test runs llama 13B, so it is required to run on 4 GPUs.
    - pytest -v -s -x lora/test_long_context.py
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_minicpmv_tp.py
 - label: Weight Loading Multiple GPU Test  # 33min
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -10,27 +10,32 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin
+/vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 CMakeLists.txt @tlrmchlsmth
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb
 # Test ownership
 /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/quantization @mgoin @robertgshaw2-redhat
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
 /tests/multi_step @alexm-redhat @comaniac
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
 /tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
--- a/.github/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github/ISSUE_TEMPLATE/600-new-model.yml
@@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
 - type: textarea
  attributes:
    label: The model to consider.
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
@@ -1,28 +0,0 @@
 name: 🎲 Misc/random discussions that do not fit into the above categories.
 description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
 title: "[Misc]: "
 labels: ["misc"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: Anything you want to discuss about vllm.
    description: >
      Anything you want to discuss about vllm.
  validations:
    required: true
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1 +1,5 @@
 blank_issues_enabled: false
 contact_links:
  - name: Questions
    url: https://discuss.vllm.ai
    about: Ask questions and discuss with other vLLM community members
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
 FIX #xxxx (*link existing issues this PR will resolve*)
 <!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -23,7 +23,7 @@ updates:
      - dependency-name: "lm-format-enforcer"
      - dependency-name: "gguf"
      - dependency-name: "compressed-tensors"
-      - dependency-name: "ray[adag]"
+      - dependency-name: "ray[cgraph]" # Ray Compiled Graph
      - dependency-name: "lm-eval"
    groups:
      minor-update:
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -5,6 +5,7 @@ pull_request_rules:
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
      - files~=^examples/
  actions:
    label:
      add:
@@ -18,7 +19,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
-      - files~=^Dockerfile
+      - files~=^docker/Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
@@ -35,6 +36,21 @@ pull_request_rules:
      add:
        - frontend
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      - files~=^tests/models/*/audio_language/
      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
@@ -72,6 +88,36 @@ pull_request_rules:
      add:
        - v1
 - name: label-tpu
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
    - or:
      - files~=tpu.py
      - files~=_tpu
      - files~=tpu_
      - files~=/tpu/
      - files~=pallas
  actions:
    label:
      add:
        - tpu
 - name: label-tpu-remove
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
    - and:
      - -files~=tpu.py
      - -files~=_tpu
      - -files~=tpu_
      - -files~=/tpu/
      - -files~=pallas
  actions:
    label:
      remove:
        - tpu
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
      - conflict
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -12,7 +12,7 @@ jobs:
          fetch-depth: 0
      - name: Set up Helm
-        uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
+        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
        with:
          version: v3.14.4
@@ -50,7 +50,7 @@ jobs:
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
      - name: Build the Docker image vllm cpu
-        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -39,7 +39,7 @@ jobs:
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)
-  # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow. 
+  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. 
  # wheel:
  #   name: Build Wheel
  #   runs-on: ${{ matrix.os }}
@@ -50,7 +50,7 @@ jobs:
  #     matrix:
  #         os: ['ubuntu-20.04']
  #         python-version: ['3.9', '3.10', '3.11', '3.12']
-  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
+  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
  #         cuda-version: ['11.8', '12.1']
  #   steps:
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 # Install requirements
-$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
+$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
--- a/.github/workflows/scripts/create_release.js
+++ b/.github/workflows/scripts/create_release.js
@@ -1,4 +1,4 @@
-// Uses Github's API to create the release and wait for result.
+// Uses GitHub's API to create the release and wait for result.
 // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
 module.exports = async (github, context, core) => {
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,8 @@
 /vllm/_version.py
 # vllm-flash-attn built from source
-vllm/vllm_flash_attn/
+vllm/vllm_flash_attn/*
 !vllm/vllm_flash_attn/fa_utils.py
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -197,7 +198,7 @@ _build/
 hip_compat.h
 # Benchmark dataset
-benchmarks/*.json
+benchmarks/**/*.json
 # Linting
 actionlint
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,10 @@
 default_install_hook_types:
  - pre-commit
  - commit-msg
 default_stages:
  - pre-commit # Run locally
  - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
 - repo: https://github.com/google/yapf
  rev: v0.43.0
@@ -8,13 +12,11 @@ repos:
  - id: yapf
    args: [--in-place, --verbose]
    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
    exclude: 'vllm/third_party/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.9.3
  hooks:
  - id: ruff
    args: [--output-format, github, --fix]
    exclude: 'vllm/third_party/.*'
 - repo: https://github.com/codespell-project/codespell
  rev: v2.4.0
  hooks:
@@ -22,10 +24,9 @@ repos:
    additional_dependencies: ['tomli']
    args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
-  rev: 5.13.2
+  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
  hooks:
  - id: isort
    exclude: 'vllm/third_party/.*'
 - repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v19.1.7
  hooks:
@@ -38,12 +39,16 @@ repos:
  hooks:
  - id: pymarkdown
    args: [fix]
    exclude: 'vllm/third_party/.*'
 - repo: https://github.com/rhysd/actionlint
  rev: v1.7.7
  hooks:
  - id: actionlint
-    exclude: 'vllm/third_party/.*'
+- repo: https://github.com/astral-sh/uv-pre-commit
  rev: 0.6.2
  hooks:
    - id: pip-compile
      args: [requirements/test.in, -o, requirements/test.txt]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
  - id: mypy-local
@@ -51,9 +56,8 @@ repos:
    entry: tools/mypy.sh 0 "local"
    language: python
    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
    stages: [pre-commit] # Don't run in CI
    exclude: 'vllm/third_party/.*'
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.9
    entry: tools/mypy.sh 1 "3.9"
@@ -61,7 +65,6 @@ repos:
    types: [python]
    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
    exclude: 'vllm/third_party/.*'
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
    entry: tools/mypy.sh 1 "3.10"
@@ -69,7 +72,6 @@ repos:
    types: [python]
    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
    exclude: 'vllm/third_party/.*'
  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.11
    entry: tools/mypy.sh 1 "3.11"
@@ -77,7 +79,6 @@ repos:
    types: [python]
    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
    exclude: 'vllm/third_party/.*'
  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.12
    entry: tools/mypy.sh 1 "3.12"
@@ -85,19 +86,16 @@ repos:
    types: [python]
    additional_dependencies: *mypy_deps
    stages: [manual] # Only run in CI
    exclude: 'vllm/third_party/.*'
  - id: shellcheck
    name: Lint shell scripts
    entry: tools/shellcheck.sh
    language: script
    types: [shell]
    exclude: 'vllm/third_party/.*'
  - id: png-lint
    name: Lint PNG exports from excalidraw
    entry: tools/png-lint.sh
    language: script
    types: [png]
    exclude: 'vllm/third_party/.*'
  - id: signoff-commit
    name: Sign-off Commit
    entry: bash
@@ -110,13 +108,11 @@ repos:
    language: system
    verbose: true
    stages: [commit-msg]
    exclude: 'vllm/third_party/.*'
  - id: check-spdx-header
    name: Check SPDX headers
    entry: python tools/check_spdx_header.py
    language: python
    types: [python]
    exclude: 'vllm/third_party/.*'
  - id: check-filenames
    name: Check for spaces in all filenames
    entry: bash
@@ -126,7 +122,12 @@ repos:
    language: system
    always_run: true
    pass_filenames: false
-    exclude: 'vllm/third_party/.*'
+  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
    entry: tools/update-dockerfile-graph.sh
    language: script
    files: ^docker/Dockerfile$
    pass_filenames: false
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
@@ -134,5 +135,4 @@ repos:
    language: system
    verbose: true
    pass_filenames: false
    exclude: 'vllm/third_party/.*'
  # Insert new entries above the `suggestion` entry
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -18,4 +18,4 @@ formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
  install:
-    - requirements: docs/requirements-docs.txt
+    - requirements: requirements/docs.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,10 +31,10 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -44,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent.  The ROCm torch
-# versions are derived from Dockerfile.rocm
+# versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
 #
 # Try to find python package with an executable that exactly matches
@@ -174,6 +174,25 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 #
 # Set rocm version dev int.
 #
 if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
  #
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
  #
  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
  #
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
 endif()
 #
 # Define other extension targets
 #
@@ -211,10 +230,12 @@ set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/attention/merge_attn_states.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
@@ -222,6 +243,7 @@ set(VLLM_EXT_SRC
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -229,7 +251,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
  # Please keep this in sync with FetchContent_Declare line below.
-  set(CUTLASS_REVISION "v3.7.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -247,7 +269,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
        # Please keep this in sync with CUTLASS_REVISION line above.
-        GIT_TAG v3.7.0
+        GIT_TAG v3.8.0
        GIT_PROGRESS TRUE
        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -263,10 +285,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/custom_all_reduce.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp")
@@ -277,7 +299,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)
    set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
@@ -297,43 +319,87 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                   " in CUDA target architectures")
  endif()
  # Only build AllSpark kernels if we are building for at least some compatible archs.
  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
  if (ALLSPARK_ARCHS)
    set(ALLSPARK_SRCS
       "csrc/quantization/gptq_allspark/allspark_repack.cu"
       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
    set_gencode_flags_for_srcs(
      SRCS "${ALLSPARK_SRCS}"
      CUDA_ARCHS "${ALLSPARK_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
  else()
    message(STATUS "Not building AllSpark kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()
  set(SCALED_MM_3X_ARCHS)
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
+  # CUDA 12.0 or later
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
-       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
-    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+    # Let scaled_mm_c2x know it doesn't need to build these arches
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
                     "Hopper.")
    else()
-      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+      message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
+  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
-    # build any 3x kernels
+  # CUDA 12.8 or later
-    set(SCALED_MM_3X_ARCHS)
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
    # Let scaled_mm_c2x know it doesn't need to build these arches
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                     "later if you intend on running FP8 quantized models on "
                     "Blackwell.")
    else()
      message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()
  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
@@ -358,17 +424,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # 2:4 Sparse Kernels
  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
+  # require CUDA 12.2 or later (and only work on Hopper).
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
-    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                     "if you intend on running FP8 sparse quantized models on Hopper.")
@@ -383,7 +450,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-    )
+      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
@@ -396,6 +463,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(FP4_ARCHS)
  endif()
  #
  # CUTLASS MoE kernels
  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
  # to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
                     "if you intend on running FP8 quantized MoE models on Hopper.")
    else()
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()
  #
  # Machete kernels
@@ -477,6 +571,7 @@ define_gpu_extension_target(
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)
@@ -495,12 +590,24 @@ set(VLLM_MOE_EXT_SRC
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
 endif()
 set_gencode_flags_for_srcs(
  SRCS "${VLLM_MOE_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+  set(VLLM_MOE_WNA16_SRC
    "csrc/moe/moe_wna16.cu")
  set_gencode_flags_for_srcs(
    SRCS "${VLLM_MOE_WNA16_SRC}"
    CUDA_ARCHS "${CUDA_ARCHS}")
  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
    set(MARLIN_MOE_SRC
        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@@ -554,77 +661,8 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    WITH_SOABI)
 endif()
-# vllm-flash-attn currently only supported on CUDA
+# For CUDA we also build and ship some external projects.
-if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
+if (VLLM_GPU_LANG STREQUAL "CUDA")
-  return()
+    include(cmake/external_projects/flashmla.cmake)
    include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()
 # vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
 # arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
 # arches in the CUDA case (and instead set the gencodes on a per file basis)
 # we need to manually set VLLM_GPU_ARCHES here.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  foreach(_ARCH ${CUDA_ARCHS})
    string(REPLACE "." "" _ARCH "${_ARCH}")
    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
  endforeach()
 endif()
 #
 # Build vLLM flash attention from source
 #
 # IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
 # Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
 # They should be identical but if they aren't, this is a massive footgun.
 #
 # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
 # To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
 # If no component is specified, vllm-flash-attn is still installed.
 # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
 # This is to enable local development of vllm-flash-attn within vLLM.
 # It can be set as an environment variable or passed as a cmake argument.
 # The environment variable takes precedence.
 if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
 endif()
 if(VLLM_FLASH_ATTN_SRC_DIR)
  FetchContent_Declare(
          vllm-flash-attn SOURCE_DIR 
          ${VLLM_FLASH_ATTN_SRC_DIR}
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
 else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
          GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
 endif()
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
 # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
 # case only one is built, in the case both are built redundant work is done)
 install(
  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
  DESTINATION vllm_flash_attn
  COMPONENT _vllm_fa2_C
  FILES_MATCHING PATTERN "*.py"
 )
 install(
  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
  DESTINATION vllm_flash_attn
  COMPONENT _vllm_fa3_C
  FILES_MATCHING PATTERN "*.py"
 )
 # Nothing after vllm-flash-attn, see comment about macros above
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -1,69 +0,0 @@
 # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
 FROM ubuntu:22.04 AS cpu-test-1
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
 # intel-openmp provides additional performance improvement vs. openmp
 # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install intel-openmp==2025.0.1
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 RUN pip install intel_extension_for_pytorch==2.5.0
 WORKDIR /workspace
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt
 FROM cpu-test-1 AS build
 WORKDIR /workspace/vllm
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl && \
    rm -rf dist
 WORKDIR /workspace/
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -e tests/vllm_test_utils
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -1,29 +0,0 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 FROM ubuntu:22.04 AS dev
 RUN apt-get update -y && \
    apt-get install -y \
        git python3-pip \
        ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 RUN python3 -m pip install -U pip
 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
 # build vLLM with OpenVINO backend
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
 COPY examples/ /workspace/examples
 COPY benchmarks/ /workspace/benchmarks
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 CMD ["/bin/bash"]
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -1,37 +0,0 @@
 FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root
 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
 RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
 RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
 RUN --mount=type=cache,target=/root/.cache/pip  \
    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
        -r requirements-cpu.txt \
        xformers uvloop==0.20.0
 RUN --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py install
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 WORKDIR /workspace/
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -1,69 +0,0 @@
 FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
    curl \
    ffmpeg \
    git \
    libsndfile1 \
    libsm6 \
    libxext6 \
    libgl1 \
    lsb-release \
    numactl \
    python3 \
    python3-dev \
    python3-pip \
    # vim \
    wget
 WORKDIR /workspace/vllm
 COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
 COPY requirements-common.txt /workspace/vllm/requirements-common.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --no-cache-dir \
    -r requirements-xpu.txt
 RUN git clone https://github.com/intel/pti-gpu && \
    cd pti-gpu/sdk && \
    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
    mkdir build && \
    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
    make -j && \
    cmake --install . --config Release --prefix "/usr/local"
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
 COPY . .
 ARG GIT_REPO_CHECK
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
 ENV VLLM_TARGET_DEVICE=xpu
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    python3 setup.py install
 CMD ["/bin/bash"]
 FROM vllm-base AS vllm-openai
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer 'modelscope!=1.15.0'
 ENV VLLM_USAGE_SOURCE production-docker-image \
    TRITON_XPU_PROFILE 1
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,9 +1,9 @@
 include LICENSE
-include requirements-common.txt
+include requirements/common.txt
-include requirements-cuda.txt
+include requirements/cuda.txt
-include requirements-rocm.txt
+include requirements/rocm.txt
-include requirements-neuron.txt
+include requirements/neuron.txt
-include requirements-cpu.txt
+include requirements/cpu.txt
 include CMakeLists.txt
 recursive-include cmake *
--- a/README.md
+++ b/README.md
@@ -10,20 +10,24 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 ---
 We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9)
 ---
 *Latest News* 🔥
-
+- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 <details>
 <summary>Previous News</summary>
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
 - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
@@ -37,8 +41,9 @@ We are excited to invite you to our Menlo Park meetup with Meta, evening of Thur
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
+</details>
 ---
 ## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.
@@ -86,14 +91,14 @@ pip install vllm
 ```
 Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
 - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
 - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
 ## Contributing
 We welcome and value any contributions and collaborations.
-Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
 ## Sponsors
@@ -116,6 +121,7 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
 - Intel
 - Lambda Lab
 - Nebius
 - Novita AI
@@ -146,10 +152,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us
- For technical questions and feature requests, please use Github issues or discussions.
+- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users and coordinating contributions and development, please use Slack.
+- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For security disclosures, please use Github's security advisory feature.
+- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
+- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
 ## Media Kit
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -0,0 +1,54 @@
 # Releasing vLLM
 vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
 ## Release Versioning
 vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
 * _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
 * _minor_ major features
 * _patch_ features and backwards-compatible bug fixes
 * _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
 ## Release Cadence
 Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
 Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
 | Release Date | Patch release versions | Post Release versions |
 | --- | --- | --- |
 | Jan 2025 | 0.7.0 | --- |
 | Feb 2025 | 0.7.1, 0.7.2, 0.7.3  | --- |
 | Mar 2025 | 0.7.4, 0.7.5 | --- |
 | Apr 2025 | 0.7.6, 0.7.7 | --- |
 | May 2025 | 0.7.8, 0.7.9 | --- |
 | Jun 2025 | 0.7.10, 0.7.11 | --- |
 | Jul 2025 | 0.7.12, 0.7.13 | --- |
 | Aug 2025 | 0.7.14, 0.7.15 | --- |
 | Sep 2025 | 0.7.16, 0.7.17 | --- |
 | Oct 2025 | 0.7.18, 0.7.19 | --- |
 | Nov 2025 | 0.7.20, 0.7.21 | --- |
 | Dec 2025 | 0.7.22, 0.7.23 | --- |
 ## Release branch
 Each release is built from a dedicated release branch.
 * For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
 * For post releases, previously cut release branch is reused
 * Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
 * Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
 * After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
 ## Release Cherry-Pick Criteria
 After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
 * Regression fixes - that address functional/performance regression against the most recent release (e.g. 0.7.0 for 0.7.1 release)
 * Critical fixes - critical fixes for severe issue such as silent incorrectness, backwards compatibility, crashes, deadlocks, (large) memory leaks
 * Fixes to new features introduced in the most recent release (e.g. 0.7.0 for 0.7.1 release)
 * Documentation improvements
 * Release branch specific changes (e.g. change version identifiers or CI fixes)
 Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,29 +1,343 @@
 # Benchmarking vLLM
-## Downloading the ShareGPT dataset
+This README guides you through running benchmark tests with the extensive
 datasets supported on vLLM. It’s a living document, updated as new features and datasets
 become available.
-You can download the dataset by running:
+## Dataset Overview
 <table style="width:100%; border-collapse: collapse;">
  <thead>
    <tr>
      <th style="width:15%; text-align: left;">Dataset</th>
      <th style="width:10%; text-align: center;">Online</th>
      <th style="width:10%; text-align: center;">Offline</th>
      <th style="width:65%; text-align: left;">Data Path</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><strong>ShareGPT</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
    </tr>
    <tr>
      <td><strong>BurstGPT</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
    </tr>
    <tr>
      <td><strong>Sonnet</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td>Local file: <code>benchmarks/sonnet.txt</code></td>
    </tr>
    <tr>
      <td><strong>Random</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>synthetic</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-VisionArena</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>lmarena-ai/VisionArena-Chat</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-InstructCoder</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>likaixin/InstructCoder</code></td>
    </tr>
      <tr>
      <td><strong>HuggingFace-AIMO</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-Other</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
    </tr>
  </tbody>
 </table>
 ✅: supported
 🟡: Partial support
 🚧: to be supported
 **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
 ---
 ## Example - Online Benchmark
 First start serving your model
 ```bash
-wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
 ```
-## Downloading the ShareGPT4V dataset
+Then run the benchmarking script
 The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
 will ignore a datapoint if the referred image is missing.
 ```bash
-wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
+# download dataset
-mkdir coco -p
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
+python3 vllm/benchmarks/benchmark_serving.py \
-unzip coco/train2017.zip -d coco/
+  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 10
 ```
-# Downloading the BurstGPT dataset
+If successful, you will see the following output
-You can download the BurstGPT v1.1 dataset by running:
+```
 ============ Serving Benchmark Result ============
 Successful requests:                     10        
 Benchmark duration (s):                  5.78      
 Total input tokens:                      1369      
 Total generated tokens:                  2212      
 Request throughput (req/s):              1.73      
 Output token throughput (tok/s):         382.89    
 Total Token throughput (tok/s):          619.85    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          71.54     
 Median TTFT (ms):                        73.88     
 P99 TTFT (ms):                           79.49     
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          7.91      
 Median TPOT (ms):                        7.96      
 P99 TPOT (ms):                           8.03      
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           7.74      
 Median ITL (ms):                         7.70      
 P99 ITL (ms):                            8.39      
 ==================================================
 ```
 ### VisionArena Benchmark for Vision Language Models
 ```bash
-wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
+# need a model with vision capability here
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 ```bash
 python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path lmarena-ai/VisionArena-Chat \
  --hf-split train \
  --num-prompts 1000
 ```
 ### InstructCoder Benchmark with Speculative Decoding
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --speculative-model "[ngram]" \
    --ngram_prompt_lookup_min 2 \
    --ngram-prompt-lookup-max 5 \
    --num_speculative_tokens 5
 ```
 ``` bash
 python3 benchmarks/benchmark_serving.py \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name hf \
    --dataset-path likaixin/InstructCoder \
    --num-prompts 2048
 ```
 ### Other HuggingFaceDataset Examples
 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 **`lmms-lab/LLaVA-OneVision-Data`**
 ```bash
 python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path lmms-lab/LLaVA-OneVision-Data \
  --hf-split train \
  --hf-subset "chart2text(cauldron)" \
  --num-prompts 10
 ```
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 ```bash
 python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
  --hf-split train \
  --num-prompts 10
 ```
 **`AI-MO/aimo-validation-aime`**
 ``` bash
 python3 vllm/benchmarks/benchmark_serving.py \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path AI-MO/aimo-validation-aime \
    --num-prompts 10 \
    --seed 42
 ```
 ### Running With Sampling Parameters
 When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
 ```bash
 python3 vllm/benchmarks/benchmark_serving.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --top-k 10 \
  --top-p 0.9 \
  --temperature 0.5 \
  --num-prompts 10
 ```
 ---
 ## Example - Offline Throughput Benchmark
 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset-name sonnet \
  --dataset-path vllm/benchmarks/sonnet.txt \
  --num-prompts 10
 ```
 If successful, you will see the following output
 ```
 Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
 Total num prompt tokens:  5014
 Total num output tokens:  1500
 ```
 ### VisionArena Benchmark for Vision Language Models
 ``` bash
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path lmarena-ai/VisionArena-Chat \
  --num-prompts 1000 \
  --hf-split train
 ```
 The `num prompt tokens` now includes image token counts
 ```
 Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
 Total num prompt tokens:  14527
 Total num output tokens:  1280
 ```
 ### InstructCoder Benchmark with Speculative Decoding
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
 python3 vllm/benchmarks/benchmark_throughput.py \
    --dataset-name=hf \
    --dataset-path=likaixin/InstructCoder \
    --model=meta-llama/Meta-Llama-3-8B-Instruct \
    --input-len=1000 \
    --output-len=100 \
    --num-prompts=2048 \
    --async-engine \
    --speculative-model="[ngram]" \
    --ngram_prompt_lookup_min=2 \
    --ngram-prompt-lookup-max=5 \
    --num_speculative_tokens=5
 ```
 ```
 Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
 Total num prompt tokens:  261136
 Total num output tokens:  204800
 ```
 ### Other HuggingFaceDataset Examples
 **`lmms-lab/LLaVA-OneVision-Data`**
 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path lmms-lab/LLaVA-OneVision-Data \
  --hf-split train \
  --hf-subset "chart2text(cauldron)" \
  --num-prompts 10
 ```
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --backend vllm-chat \
  --dataset-name hf \
  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
  --hf-split train \
  --num-prompts 10
 ```
 **`AI-MO/aimo-validation-aime`**
 ```bash
 python3 benchmarks/benchmark_throughput.py \
  --model Qwen/QwQ-32B \
  --backend vllm \
  --dataset-name hf \
  --dataset-path AI-MO/aimo-validation-aime \
  --hf-split train \
  --num-prompts 10
 ```
 ### Benchmark with LoRA Adapters
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 python3 vllm/benchmarks/benchmark_throughput.py \
  --model meta-llama/Llama-2-7b-hf \
  --backend vllm \
  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --dataset_name sharegpt \
  --num-prompts 10 \
  --max-loras 2 \
  --max-lora-rank 8 \
  --enable-lora \
  --lora-path yard1/llama-2-7b-sql-lora-test
  ```
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -6,7 +6,7 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union
 import aiohttp
 import huggingface_hub.constants
@@ -14,6 +14,9 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)
 # NOTE(simon): do not import vLLM here so the benchmark script
 # can run without vLLM installed.
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -25,7 +28,6 @@ class RequestFuncInput:
    output_len: int
    model: str
    model_name: Optional[str] = None
    best_of: int = 1
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
@@ -39,8 +41,8 @@ class RequestFuncOutput:
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
-    itl: List[float] = field(
+    itl: list[float] = field(
-        default_factory=list)  # List of inter-token latencies
+        default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""
@@ -56,13 +58,12 @@ async def async_request_tgi(
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
-            # TGI does not accept ignore_eos flag.
+            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
@@ -70,6 +71,10 @@ async def async_request_tgi(
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        if request_func_input.ignore_eos:
            output.output_tokens = request_func_input.output_len
        else:
            output.output_tokens = None
        ttft = 0.0
        st = time.perf_counter()
@@ -128,7 +133,6 @@ async def async_request_trt_llm(
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
@@ -193,7 +197,6 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        payload = {
            "prompt": request_func_input.prompt,
@@ -216,7 +219,15 @@ async def async_request_deepspeed_mii(
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
-                    output.generated_text = parsed_resp["text"][0]
+                    if "choices" in parsed_resp:
                        output.generated_text = parsed_resp["choices"][0][
                            "text"]
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                    else:
                        output.error = ("Unexpected response format: "
                                        "neither 'choices' nor 'text' found")
                        output.success = False
                    output.success = True
                else:
                    output.error = response.reason or ""
@@ -247,7 +258,6 @@ async def async_request_openai_completions(
                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
@@ -336,7 +346,7 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        "chat/completions"
+        ("chat/completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
    async with aiohttp.ClientSession(trust_env=True,
@@ -430,12 +440,17 @@ def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download
-        model_path = snapshot_download(
+        from vllm.model_executor.model_loader.weight_utils import get_lock
            model_id=pretrained_model_name_or_path,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
-        return model_path
+        # Use file lock to prevent multiple processes from
        # downloading the same model weights at the same time.
        with get_lock(pretrained_model_name_or_path):
            model_path = snapshot_download(
                model_id=pretrained_model_name_or_path,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
            return model_path
    return pretrained_model_name_or_path
@@ -482,3 +497,9 @@ ASYNC_REQUEST_FUNCS = {
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
 }
 OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_openai_chat_completions)
 ]
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -0,0 +1,817 @@
 # SPDX-License-Identifier: Apache-2.0
 """
 This module defines a framework for sampling benchmark requests from various
 datasets. Each dataset subclass of BenchmarkDataset must implement sample
 generation. Supported dataset types include:
  - ShareGPT
  - Random (synthetic)
  - Sonnet
  - BurstGPT
  - HuggingFace
  - VisionArena
 TODO: Implement CustomDataset to parse a JSON file and convert its contents into
 SampleRequest instances, similar to the approach used in ShareGPT.
 """
 import base64
 import io
 import json
 import logging
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass
 from functools import cache
 from io import BytesIO
 from typing import Any, Callable, Optional, Union
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
 from PIL import Image
 from transformers import PreTrainedTokenizerBase
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 logger = logging.getLogger(__name__)
 # -----------------------------------------------------------------------------
 # Data Classes
 # -----------------------------------------------------------------------------
@dataclass
 class SampleRequest:
    """
    Represents a single inference request for benchmarking.
    """
    prompt: Union[str, Any]
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
    lora_request: Optional[LoRARequest] = None
 # -----------------------------------------------------------------------------
 # Benchmark Dataset Base Class
 # -----------------------------------------------------------------------------
 class BenchmarkDataset(ABC):
    DEFAULT_SEED = 0
    def __init__(
        self,
        dataset_path: Optional[str] = None,
        random_seed: int = DEFAULT_SEED,
    ) -> None:
        """
        Initialize the BenchmarkDataset with an optional dataset path and random
        seed.  Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
            indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
            sampling. Defaults to DEFAULT_SEED.
        """
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
        self.random_seed = (random_seed
                            if random_seed is not None else self.DEFAULT_SEED)
        self.data = None
    def apply_multimodal_chat_transformation(
            self,
            prompt: str,
            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
        This method is used for chat models that expect a specific conversation
        format.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
            content.append(mm_content)
        return [{"role": "user", "content": content}]
    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.
        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.
        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError(
            "load_data must be implemented in subclasses.")
    def get_random_lora_request(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
    ) -> tuple[Optional[LoRARequest], AnyTokenizer]:
        """
        Optionally select a random LoRA request and return its associated
        tokenizer.
        This method is used when LoRA parameters are provided.  It randomly
        selects a LoRA based on max_loras and retrieves a cached tokenizer for
        that LoRA if available. Otherwise, it returns the base tokenizer.
        Args:
            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
            LoRA is selected.  max_loras (Optional[int]): The maximum number of
            LoRAs available. If None, LoRA is not used.  lora_path
            (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
            is not used.
        Returns:
            tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
            element is a LoRARequest (or None if not applicable) and the second
            element is the tokenizer associated with the LoRA request (or the
            base tokenizer).
        """
        if max_loras is None or lora_path is None:
            return None, tokenizer
        # Generate a random LoRA ID in the range [1, max_loras].
        lora_id = random.randint(1, max_loras)
        lora_request = LoRARequest(
            lora_name=str(lora_id),
            lora_int_id=lora_id,
            lora_path=lora_path_on_disk(lora_path),
        )
        if lora_id not in lora_tokenizer_cache:
            lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
        # Return lora_request and the cached tokenizer if available; otherwise,
        # return the base tokenizer
        return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
    @abstractmethod
    def sample(self, tokenizer: PreTrainedTokenizerBase,
               num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.
        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.
        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
             for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")
    def maybe_oversample_requests(self, requests: list[SampleRequest],
                                  num_requests: int) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
        number.
        Args:
            requests (List[SampleRequest]): The current list of sampled
            requests.  num_requests (int): The target number of requests.
        """
        if len(requests) < num_requests:
            random.seed(self.random_seed)
            additional = random.choices(requests,
                                        k=num_requests - len(requests))
            requests.extend(additional)
            logger.info("Oversampled requests to reach %d total samples.",
                        num_requests)
 # -----------------------------------------------------------------------------
 # Utility Functions and Global Caches
 # -----------------------------------------------------------------------------
 def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
 ) -> bool:
    """
    Validate a sequence based on prompt and output lengths.
    Default pruning criteria are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
    from `sample_requests` in benchmark_throughput.py.
    """
    # Check for invalid conditions
    prompt_too_short = prompt_len < min_len
    output_too_short = (not skip_min_output_len_check) and (output_len
                                                            < min_len)
    prompt_too_long = prompt_len > max_prompt_len
    combined_too_long = (prompt_len + output_len) > max_total_len
    # Return True if none of the invalid conditions are met
    return not (prompt_too_short or output_too_short or prompt_too_long
                or combined_too_long)
@cache
 def lora_path_on_disk(lora_path: str) -> str:
    return get_adapter_absolute_path(lora_path)
 # Global cache for LoRA tokenizers.
 lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
 def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.
    Supports three input types:
    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
       containing raw image data.  - Loads the bytes as a PIL.Image.Image.
    2. PIL.Image.Image input: - Converts the image to RGB.  - Saves the image as
       a JPEG in memory.  - Encodes the JPEG data as a base64 string.  - Returns
       a dictionary with the image as a base64 data URL.
    3. String input: - Treats the string as a URL or local file path.  -
       Prepends "file://" if the string doesn't start with "http://" or
       "file://".  - Returns a dictionary with the image URL.
    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(image, dict) and 'bytes' in image:
        image = Image.open(BytesIO(image['bytes']))
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
            image_base64 = base64.b64encode(
                image_data.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            },
        }
    if isinstance(image, str):
        image_url = (image if image.startswith(
            ("http://", "file://")) else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": image_url}}
    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
                     " or str or dictionary with raw image bytes.")
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
 class RandomDataset(BenchmarkDataset):
    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
    DEFAULT_RANGE_RATIO = 0.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128
    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        **kwargs,
    ) -> list[SampleRequest]:
        # Enforce range_ratio < 1
        assert range_ratio < 1.0, (
            "random_range_ratio must be < 1.0 to ensure a valid sampling range"
        )
        vocab_size = tokenizer.vocab_size
        prefix_token_ids = (np.random.randint(
            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
        # New sampling logic: [X * (1 - b), X * (1 + b)]
        input_low = int(input_len * (1 - range_ratio))
        input_high = int(input_len * (1 + range_ratio))
        output_low = int(output_len * (1 - range_ratio))
        output_high = int(output_len * (1 + range_ratio))
        # Add logging for debugging
        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
        logger.info("Sampling output_len from [%s, %s]", output_low,
                    output_high)
        input_lens = np.random.randint(input_low,
                                       input_high + 1,
                                       size=num_requests)
        output_lens = np.random.randint(output_low,
                                        output_high + 1,
                                        size=num_requests)
        offsets = np.random.randint(0, vocab_size, size=num_requests)
        requests = []
        for i in range(num_requests):
            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
                         vocab_size).tolist()
            token_sequence = prefix_token_ids + inner_seq
            prompt = tokenizer.decode(token_sequence)
            total_input_len = prefix_len + int(input_lens[i])
            requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(output_lens[i]),
                ))
        return requests
 # -----------------------------------------------------------------------------
 # ShareGPT Dataset Implementation
 # -----------------------------------------------------------------------------
 class ShareGPTDataset(BenchmarkDataset):
    """
    Implements the ShareGPT dataset.  Loads data from a JSON file and generates
    sample requests based on conversation turns.
    """
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()
    def load_data(self) -> None:
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = json.load(f)
        # Filter entries with at least two conversation turns.
        self.data = [
            entry for entry in self.data
            if "conversations" in entry and len(entry["conversations"]) >= 2
        ]
        random.seed(self.random_seed)
        random.shuffle(self.data)
    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            prompt, completion = (
                entry["conversations"][0]["value"],
                entry["conversations"][1]["value"],
            )
            lora_request, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            new_output_len = (len(completion_ids)
                              if output_len is None else output_len)
            if not is_valid_sequence(prompt_len,
                                     new_output_len,
                                     skip_min_output_len_check=output_len
                                     is not None):
                continue
            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, None)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                ))
        self.maybe_oversample_requests(samples, num_requests)
        return samples
 # -----------------------------------------------------------------------------
 # Sonnet Dataset Implementation
 # -----------------------------------------------------------------------------
 class SonnetDataset(BenchmarkDataset):
    """
    Simplified implementation of the Sonnet dataset.  Loads poem lines from a
    text file and generates sample requests.  Default values here copied from
    `benchmark_serving.py` for the sonnet dataset.
    """
    DEFAULT_PREFIX_LEN = 200
    DEFAULT_INPUT_LEN = 550
    DEFAULT_OUTPUT_LEN = 150
    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.load_data()
    def load_data(self) -> None:
        if not self.dataset_path:
            raise ValueError("dataset_path must be provided.")
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = f.readlines()
    def sample(
        self,
        tokenizer,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        return_prompt_formatted: bool = False,
        **kwargs,
    ) -> list:
        # Calculate average token length for a poem line.
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
        avg_len = sum(len(tokens)
                      for tokens in tokenized_lines) / len(tokenized_lines)
        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
        base_msg = [{"role": "user", "content": base_prompt}]
        base_fmt = tokenizer.apply_chat_template(base_msg,
                                                 add_generation_prompt=True,
                                                 tokenize=False)
        base_offset = len(tokenizer(base_fmt).input_ids)
        if input_len <= base_offset:
            raise ValueError(
                f"'input_len' must be higher than the base prompt length "
                f"({base_offset}).")
        # Determine how many poem lines to use.
        num_input_lines = round((input_len - base_offset) / avg_len)
        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
        prefix_lines = self.data[:num_prefix_lines]
        samples = []
        while len(samples) < num_requests:
            extra_lines = random.choices(self.data,
                                         k=num_input_lines - num_prefix_lines)
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
            msg = [{"role": "user", "content": prompt}]
            prompt_formatted = tokenizer.apply_chat_template(
                msg, add_generation_prompt=True, tokenize=False)
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
            if prompt_len <= input_len:
                samples.append(
                    SampleRequest(
                        prompt=prompt_formatted
                        if return_prompt_formatted else prompt,
                        prompt_len=prompt_len,
                        expected_output_len=output_len,
                    ))
        return samples
 # -----------------------------------------------------------------------------
 # BurstGPT Dataset Implementation
 # -----------------------------------------------------------------------------
 class BurstGPTDataset(BenchmarkDataset):
    """
    Implements the BurstGPT dataset.  Loads data from a CSV file and generates
    sample requests based on synthetic prompt generation. Only rows with Model
    "GPT-4" and positive response tokens are used.
    """
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()
    def load_data(self, ):
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")
        df = pd.read_csv(self.dataset_path)
        # Filter to keep only GPT-4 rows.
        gpt4_df = df[df["Model"] == "GPT-4"]
        # Remove failed requests (where Response tokens is 0 or less).
        gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
        # Sample the desired number of rows.
        self.data = gpt4_df
    def _sample_loaded_data(self, num_requests: int) -> list:
        if num_requests <= len(self.data):
            data = self.data.sample(n=num_requests,
                                    random_state=self.random_seed)
        else:
            data = self.data.sample(
                n=num_requests,
                random_state=self.random_seed,
                replace=True,
            )
        # Convert the dataframe to a list of lists.
        return data.values.tolist()
    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
        **kwargs,
    ) -> list[SampleRequest]:
        samples = []
        data = self._sample_loaded_data(num_requests=num_requests)
        for i in range(num_requests):
            input_len = int(data[i][2])
            output_len = int(data[i][3])
            lora_req, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
            vocab_size = tokenizer.vocab_size
            # Generate a synthetic prompt: a list of token IDs computed as (i +
            # j) modulo vocab_size.
            token_ids = [(i + j) % vocab_size for j in range(input_len)]
            prompt = tokenizer.decode(token_ids)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=input_len,
                    expected_output_len=output_len,
                    lora_request=lora_req,
                ))
        return samples
 # -----------------------------------------------------------------------------
 # HuggingFace Dataset Base Implementation
 # -----------------------------------------------------------------------------
 class HuggingFaceDataset(BenchmarkDataset):
    """Base class for datasets hosted on HuggingFace."""
    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(dataset_path=dataset_path, **kwargs)
        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
        self.load_data()
    def load_data(self) -> None:
        """Load data from HuggingFace datasets."""
        self.data = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=True,
        )
        self.data = self.data.shuffle(seed=self.random_seed)
 # -----------------------------------------------------------------------------
 # Conversation Dataset Implementation
 # -----------------------------------------------------------------------------
 class ConversationDataset(HuggingFaceDataset):
    """Dataset for conversation data with multimodal support."""
    SUPPORTED_DATASET_PATHS = {
        'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
    }
    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(
            lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        dynamic_output = output_len is None
        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            completion_len = len(completion_ids)
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
            if dynamic_output and not is_valid_sequence(
                    prompt_len, completion_len):
                continue
            mm_content = process_image(
                item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len and output len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
 # -----------------------------------------------------------------------------
 # Vision Arena Dataset Implementation
 # -----------------------------------------------------------------------------
 class VisionArenaDataset(HuggingFaceDataset):
    """
    Vision Arena Dataset.
    """
    DEFAULT_OUTPUT_LEN = 128
    SUPPORTED_DATASET_PATHS = {
        "lmarena-ai/VisionArena-Chat":
        lambda x: x["conversation"][0][0]["content"],
        "lmarena-ai/vision-arena-bench-v0.1":
        lambda x: x["turns"][0][0]["content"]
    }
    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
            if parser_fn is None:
                raise ValueError(
                    f"Unsupported dataset path: {self.dataset_path}")
            prompt = parser_fn(item)
            mm_content = process_image(item["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
 # -----------------------------------------------------------------------------
 # Instruct Coder Dataset Implementation
 # -----------------------------------------------------------------------------
 class InstructCoderDataset(HuggingFaceDataset):
    """
    InstructCoder Dataset.
    https://huggingface.co/datasets/likaixin/InstructCoder
    InstructCoder is the dataset designed for general code editing.  It consists
    of 114,239 instruction-input-output triplets, and covers multiple distinct
    code editing scenario.
    """
    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
    SUPPORTED_DATASET_PATHS = {
        "likaixin/InstructCoder",
    }
    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt = f"{item['instruction']}:\n{item['input']}"
            prompt_len = len(tokenizer(prompt).input_ids)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
 # -----------------------------------------------------------------------------
 # AIMO Dataset Implementation
 # -----------------------------------------------------------------------------
 class AIMODataset(HuggingFaceDataset):
    """
    Dataset class for processing a AIMO dataset with reasoning questions.
    """
    SUPPORTED_DATASET_PATHS = {
        "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
        "AI-MO/NuminaMath-CoT"
    }
    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               **kwargs) -> list:
        sampled_requests = []
        dynamic_output = output_len is None
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt, completion = item['problem'], item["solution"]
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            completion_len = len(completion_ids)
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
            if dynamic_output and not is_valid_sequence(prompt_len,
                                                        completion_len,
                                                        max_prompt_len=2048,
                                                        max_total_len=32000):
                continue
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=None,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
--- a/benchmarks/benchmark_guided.py
+++ b/benchmarks/benchmark_guided.py
@@ -1,495 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark guided decoding throughput."""
 import argparse
 import dataclasses
 import json
 import os
 import random
 import time
 from typing import List
 import datasets
 import pandas as pd
 import uvloop
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
 from vllm.sampling_params import GuidedDecodingParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@dataclasses.dataclass
 class SampleRequest:
    """A class representing a single inference request for benchmarking.
    Attributes:
        prompt: The input text prompt for the model.
        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
            images).
        prompt_len: The length of the prompt in tokens.
        expected_output_len: The expected length of the output in tokens.
    """
    prompt: str
    prompt_len: int
    expected_output_len: int
    schema: dict
    structure_type: str = 'json'
    completion: str = None
 def run_vllm(requests: List[SampleRequest],
             engine_args: EngineArgs,
             n: int,
             guided_decoding_rate: float = 1.0,
             warmup: bool = False) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(**vars(engine_args))
    # Add the requests to the engine.
    prompts: List[str] = []
    sampling_params: List[SamplingParams] = []
    # create a list containing random selected true or false
    guided_decoding_req_idx = random.sample(
        range(len(requests)), int(len(requests) * guided_decoding_rate))
    if warmup:
        print(">>>>> Running warmup prompt, for the first 5")
        # We setup the first 5 requests to warmup FSM
        # if using xgrammar dataset, we will skip warmup
        warmup_requests = requests[:5]
        for i, request in enumerate(warmup_requests):
            prompts.append(request.prompt)
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    guided_decoding=GuidedDecodingParams(json=request.schema)
                    if guided_decoding_rate > 0 else None,
                ))
        llm.generate(prompts, sampling_params, use_tqdm=False)
    print(">>>>> Benchmark started...")
    prompts = []
    sampling_params = []
    for i, request in enumerate(requests):
        prompts.append(request.prompt)
        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                guided_decoding=GuidedDecodingParams(
                    **{request.structure_type: request.schema})
                if i in guided_decoding_req_idx else None,
            ))
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
    ret = []
    for output, request in zip(outputs, requests):
        generated_text = output.outputs[0].text
        ret.append({
            "generated": generated_text,
            "expected": request.completion
        })
    end = time.perf_counter()
    return end - start, ret
 async def run_vllm_async(
        requests: List[SampleRequest],
        engine_args: AsyncEngineArgs,
        n: int,
        guided_decoding_rate: float = 1.0,
        warmup: bool = False,
        disable_frontend_multiprocessing: bool = False) -> float:
    from vllm import SamplingParams
    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:
        # Add the requests to the engine.
        prompts: List[str] = []
        sampling_params: List[SamplingParams] = []
        guided_decoding_req_idx = random.sample(
            range(len(requests)), int(len(requests) * guided_decoding_rate))
        if warmup:
            print(">>>>>> Running warmup prompt, for the first 5")
            # We setup the first 5 requests to warmup FSM
            # if using xgrammar dataset, we will skip warmup
            warmup_requests = requests[:5]
            for i, request in enumerate(warmup_requests):
                prompts.append(request.prompt)
                sampling_params.append(
                    SamplingParams(
                        n=n,
                        temperature=1.0,
                        top_p=1.0,
                        ignore_eos=True,
                        max_tokens=request.expected_output_len,
                        guided_decoding=GuidedDecodingParams(
                            json=request.schema)
                        if guided_decoding_rate > 0 else None,
                    ))
            generators = []
            for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
                generator = llm.generate(prompt, sp, request_id=f"test{i}")
                generators.append(generator)
            all_gens = merge_async_iterators(*generators)
            async for i, res in all_gens:
                pass
        print(">>>>> Benchmark started...")
        prompts = []
        sampling_params = []
        for i, request in enumerate(requests):
            prompts.append(request.prompt)
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    guided_decoding=GuidedDecodingParams(json=request.schema)
                    if i in guided_decoding_req_idx else None,
                ))
        generators = []
        start_time = []
        latencies = []
        start = time.perf_counter()
        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
            generator = llm.generate(prompt, sp, request_id=f"test{i}")
            generators.append(generator)
            start_time.append(time.perf_counter())
            latencies.append([])
        all_gens = merge_async_iterators(*generators)
        generated_texts = [''] * len(requests)
        async for i, res in all_gens:
            generated_texts[i] = res.outputs[0].text
            lat = time.perf_counter() - start_time[i]
            latencies[i].append(lat)
        ret = [{
            'generated': gt,
            'expected': req.completion
        } for gt, req in zip(generated_texts, requests)]
        end = time.perf_counter()
        first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
        next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
                                  for lat in latencies])
        return end - start, ret, (first_latency, next_latency)
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                    args: argparse.Namespace) -> List[SampleRequest]:
    if args.dataset == 'json':
        if args.json_schema_path is None:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            args.json_schema_path = os.path.join(dir_path,
                                                 "structured_schemas",
                                                 "structured_schema_1.json")
        with open(args.json_schema_path) as f:
            schema = json.load(f)
        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=schema,
                          structure_type=args.structure_type)
            for _ in range(args.num_prompts)
        ]
    elif args.dataset == "grammar":
        schema = """
            ?start: select_statement
            ?select_statement: "SELECT " column_list " FROM " table_name
            ?column_list: column_name ("," column_name)*
            ?table_name: identifier
            ?column_name: identifier
            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
        """
        prompt = "Generate an SQL query to show the 'username' \
            and 'email' from the 'users' table."
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=schema,
                          structure_type=args.structure_type)
            for _ in range(args.num_prompts)
        ]
    elif args.dataset == "regex":
        regex = r"\w+@\w+\.com\n"
        args.regex = regex
        prompt = "Generate an email address for Alan Turing, \
            who works in Enigma. End in .com and new line. \
                Example result: alan.turing@enigma.com\n"
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=regex,
                          structure_type=args.structure_type)
            for _ in range(args.num_prompts)
        ]
    elif args.dataset == "choice":
        choice = ["Positive", "Negative"]
        args.choice = choice
        prompt = "Classify this sentiment: vLLM is wonderful!"
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=choice,
                          structure_type=args.structure_type)
            for _ in range(args.num_prompts)
        ]
    elif args.dataset == "xgrammar_bench":
        args.warmup = False
        requests: List[SampleRequest] = []
        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
                                        split="train")
        print(f"dataset has {len(dataset)} entries")
        len_dataset = len(dataset)
        for data_point_idx in range(args.num_prompts):
            idx = data_point_idx
            while idx >= len_dataset:
                idx -= len_dataset
            schema = dataset["schema"][idx]
            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
                                                   tokenize=False)
            input_len = len(tokenizer(prompt).input_ids)
            completion = dataset["completion"][idx]
            requests.append(
                SampleRequest(prompt=prompt,
                              prompt_len=input_len,
                              expected_output_len=args.output_len,
                              schema=schema,
                              completion=completion))
    return requests
 def evaluate(ret, args):
    def _eval_correctness_json(expected, actual):
        # extract json string from string using regex
        import re
        actual = actual.replace('\n', '').replace(' ', '').strip()
        try:
            actual = re.search(r'\{.*\}', actual).group()
            actual = json.loads(actual)
        except Exception:
            return False
        return True
    def _eval_correctness_choice(expected, actual):
        return actual in args.choice
    def _eval_correctness_regex(expected, actual):
        import re
        return re.match(args.regex, actual) is not None
    def _eval_correctness(expected, actual):
        if args.structure_type == 'json':
            return _eval_correctness_json(expected, actual)
        elif args.structure_type == 'regex':
            return _eval_correctness_regex(expected, actual)
        elif args.structure_type == 'choice':
            return _eval_correctness_choice(expected, actual)
        else:
            return None
    scores = []
    for res in ret:
        score = _eval_correctness(res['expected'], res['generated'])
        res['correctness'] = score
        scores.append(score)
    not_none_scores = [score for score in scores if score is not None]
    return (sum(not_none_scores) / len(not_none_scores) *
            100) if len(not_none_scores) > 0 else None
 def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
    # async engine is working for 'regex', 'choice' and 'grammar'
    if args.dataset == 'grammar':
        args.structure_type = 'grammar'
        args.async_engine = False
    elif args.dataset == 'regex':
        args.structure_type = 'regex'
        args.async_engine = False
    elif args.dataset == 'choice':
        args.structure_type = 'choice'
        args.async_engine = False
    else:
        args.structure_type = 'json'
    if args.no_guided_decoding:
        args.guided_decoding_ratio = 0
    if args.save_results:
        result_file_name = f'{args.guided_decoding_ratio}guided'
        result_file_name += f"_{args.model.split('/')[-1]}"
        result_file_name += f"_{args.dataset}"
        result_file_name += f"_{args.num_prompts}"
        result_file_name += f"_out{args.output_len}"
        result_file_name += f"_async{args.async_engine}"
        result_file_name += f"_warmup{args.warmup}"
        result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
        result_file_name += ".txt"
    else:
        result_file_name = None
    # Synthesize a prompt with the given input length.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    requests = sample_requests(tokenizer, args)
    if args.async_engine:
        engine_args = AsyncEngineArgs.from_cli_args(args)
        elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
            run_vllm_async(requests, engine_args, args.n,
                           args.guided_decoding_ratio, args.warmup,
                           args.disable_frontend_multiprocessing))
    else:
        engine_args = EngineArgs.from_cli_args(args)
        elapsed_time, ret = run_vllm(requests, engine_args, args.n,
                                     args.guided_decoding_ratio, args.warmup)
        first_latency, next_latency = None, None
    score = evaluate(ret, args)
    total_num_tokens = sum(request.prompt_len + request.expected_output_len
                           for request in requests)
    total_output_tokens = sum(request.expected_output_len
                              for request in requests)
    if first_latency is not None:
        latency_breakdown = "\nFirst token latency(msecs):\n"
        latency_breakdown += f"{first_latency.describe()}"
        latency_breakdown += "\nNext token latency(msecs):\n"
        latency_breakdown += f"{next_latency.describe()}"
    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
        f"Correct rate is {score} %",
        f"{latency_breakdown if first_latency is not None else ''}")
    # Output JSON results if specified
    if args.output_json or result_file_name:
        results = {
            "elapsed_time": elapsed_time,
            "num_requests": len(requests),
            "total_num_tokens": total_num_tokens,
            "total_output_tokens": total_output_tokens,
            "requests_per_second": len(requests) / elapsed_time,
            "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
            "output_tokens_per_second":
            f"{total_output_tokens / elapsed_time:.2f}",
            "correct_rate(%)": score
        }
        results = {"outputs": ret, **results}
        if first_latency is not None:
            results["first_token_latency(msecs)"] = first_latency.describe(
            ).to_dict()
            results["next_token_latency(msecs)"] = next_latency.describe(
            ).to_dict()
        if args.output_json:
            with open(args.output_json, "w") as f:
                json.dump(results, f, indent=4)
        elif result_file_name:
            with open(result_file_name, "w") as f:
                json.dump(results, f, indent=4)
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
    parser = AsyncEngineArgs.add_cli_args(parser)
    parser.add_argument("--output-len",
                        type=int,
                        default=512,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument(
        "--dataset",
        default='json',
        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
    parser.add_argument("--json_schema_path",
                        type=str,
                        default=None,
                        help="Path to json schema.")
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=10,
                        help="Number of prompts to process.")
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
                        help="Use vLLM async engine rather than LLM class.")
    parser.add_argument("--no-guided-decoding",
                        action='store_true',
                        default=False,
                        help="Whether to disable JSON decoding or not.")
    parser.add_argument("--guided-decoding-ratio",
                        type=float,
                        default=1.0,
                        help="Ratio of Guided Decoding requests")
    parser.add_argument("--disable-frontend-multiprocessing",
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
    parser.add_argument("--warmup",
                        action="store_true",
                        default=False,
                        help="Run warmup prompts before benchmark.")
    parser.add_argument("--save-results",
                        action="store_true",
                        default=False,
                        help="save output results.")
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    main(args)
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -7,11 +7,11 @@ import json
 import os
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional
 import numpy as np
 import torch
-from benchmark_utils import convert_to_pytorch_benchmark_format
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 from vllm import LLM, SamplingParams
@@ -22,7 +22,7 @@ from vllm.utils import FlexibleArgumentParser
 def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: Dict[str, Any]) -> None:
+                                     results: dict[str, Any]) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={"latency": results["latencies"]},
@@ -30,8 +30,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                    for k in ["avg_latency", "percentiles"]})
    if pt_records:
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        with open(pt_file, "w") as f:
+        write_to_json(pt_file, pt_records)
            json.dump(pt_records, f)
 def main(args: argparse.Namespace):
@@ -42,6 +41,10 @@ def main(args: argparse.Namespace):
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(**dataclasses.asdict(engine_args))
    assert llm.llm_engine.model_config.max_model_len >= (
        args.input_len +
        args.output_len), ("Please ensure that max_model_len is greater than"
                           " the sum of input_len and output_len.")
    sampling_params = SamplingParams(
        n=args.n,
@@ -49,12 +52,13 @@ def main(args: argparse.Namespace):
        top_p=1.0,
        ignore_eos=True,
        max_tokens=args.output_len,
        detokenize=not args.disable_detokenize,
    )
    print(sampling_params)
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
-    dummy_prompts: List[PromptType] = [{
+    dummy_prompts: list[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]
@@ -170,6 +174,12 @@ if __name__ == "__main__":
        default=None,
        help="Path to save the latency results in JSON format.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=("Do not detokenize responses (i.e. do not include "
              "detokenization time in the latency measurement)"),
    )
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -31,7 +31,7 @@ import dataclasses
 import json
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Optional
 from transformers import PreTrainedTokenizerBase
@@ -77,9 +77,9 @@ def sample_requests_from_dataset(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
-    input_length_range: Tuple[int, int],
+    input_length_range: tuple[int, int],
    fixed_output_len: Optional[int],
-) -> List[Request]:
+) -> list[Request]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
@@ -99,7 +99,7 @@ def sample_requests_from_dataset(
    assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
    # Filter out sequences that are too long or too short
-    filtered_requests: List[Request] = []
+    filtered_requests: list[Request] = []
    for i in range(len(dataset)):
        if len(filtered_requests) == num_requests:
@@ -122,10 +122,10 @@ def sample_requests_from_dataset(
 def sample_requests_from_random(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
-    input_length_range: Tuple[int, int],
+    input_length_range: tuple[int, int],
    fixed_output_len: Optional[int],
    prefix_len: int,
-) -> List[Request]:
+) -> list[Request]:
    requests = []
    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
@@ -144,9 +144,9 @@ def sample_requests_from_random(
    return requests
-def repeat_and_sort_requests(requests: List[Request],
+def repeat_and_sort_requests(requests: list[Request],
                             repeat_count: int,
-                             sort: bool = False) -> List[str]:
+                             sort: bool = False) -> list[str]:
    repeated_requests = requests * repeat_count
    if sort:
        repeated_requests.sort(key=lambda x: x[1])
@@ -194,7 +194,9 @@ def main(args):
    llm = LLM(**dataclasses.asdict(engine_args))
-    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+    sampling_params = SamplingParams(temperature=0,
                                     max_tokens=args.output_len,
                                     detokenize=not args.disable_detokenize)
    print("Testing filtered requests")
    prompts = repeat_and_sort_requests(filtered_requests,
@@ -243,6 +245,12 @@ if __name__ == "__main__":
        "subtract this length when filtering prompts. Only used "
        "when dataset-path is not provided.",
    )
    parser.add_argument(
        '--disable-detokenize',
        action='store_true',
        help=("Do not detokenize responses (i.e. do not include "
              "detokenization time in the latency measurement)"),
    )
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -5,7 +5,7 @@ import dataclasses
 import json
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Optional
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
@@ -13,12 +13,17 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser
 #Select a equi-probable random priority
 def get_random_flag():
    return 0 if random.random() < 0.5 else 1
 def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> list[tuple[str, int, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
@@ -35,7 +40,7 @@ def sample_requests(
    random.shuffle(dataset)
    # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
+    filtered_dataset: list[tuple[str, int, int]] = []
    for i in range(len(dataset)):
        if len(filtered_dataset) == num_requests:
            break
@@ -55,8 +60,7 @@ def sample_requests(
            # Prune too long sequences.
            continue
-        #Select a equi-probable random priority
+        priority = get_random_flag()
        priority = 0 if random.random() < 0.5 else 1
        filtered_dataset.append((prompt, prompt_len, output_len, priority))
@@ -64,13 +68,20 @@ def sample_requests(
 def run_vllm(
-    requests: List[Tuple[str, int, int]],
+    requests: list[tuple[str, int, int]],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
        for request in requests), (
            "Please ensure that max_model_len is greater than the sum of"
            " input_len and output_len for all requests.")
    # Add the requests to the engine.
    prompts = []
    sampling_params = []
@@ -85,6 +96,7 @@ def run_vllm(
                top_p=1.0,
                ignore_eos=True,
                max_tokens=output_len,
                detokenize=not disable_detokenize,
            ))
    start = time.perf_counter()
@@ -103,15 +115,16 @@ def main(args: argparse.Namespace):
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
        prompt = "hi" * (args.input_len - 1)
-        requests = [(prompt, args.input_len, args.output_len)
+        requests = [(prompt, args.input_len, args.output_len,
-                    for _ in range(args.num_prompts)]
+                     get_random_flag()) for _ in range(args.num_prompts)]
    else:
        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
                                   args.output_len)
    if args.backend == "vllm":
        elapsed_time = run_vllm(requests, args.n,
-                                EngineArgs.from_cli_args(args))
+                                EngineArgs.from_cli_args(args),
                                args.disable_detokenize)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    total_num_tokens = sum(prompt_len + output_len
@@ -164,6 +177,12 @@ if __name__ == "__main__":
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
    parser.add_argument(
        '--disable-detokenize',
        action='store_true',
        help=("Do not detokenize responses (i.e. do not include "
              "detokenization time in the latency measurement)"),
    )
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -7,9 +7,6 @@ On the server side, run one of the following commands:
        --swap-space 16 \
        --disable-log-requests
    (TGI backend)
    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
 On the client side, run:
    python benchmarks/benchmark_serving.py \
        --backend <backend> \
@@ -25,24 +22,21 @@ On the client side, run:
 """
 import argparse
 import asyncio
 import base64
 import gc
 import io
 import json
 import os
 import random
 import time
 import warnings
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
+from typing import Any, Optional
 import numpy as np
-import pandas as pd
+from backend_request_func import (ASYNC_REQUEST_FUNCS,
-from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
                                  RequestFuncOutput)
 from datasets import load_dataset
 from PIL.Image import Image
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
@@ -56,7 +50,12 @@ try:
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser
-from benchmark_utils import convert_to_pytorch_benchmark_format
+from benchmark_dataset import (AIMODataset, BurstGPTDataset,
                               ConversationDataset, HuggingFaceDataset,
                               InstructCoderDataset, RandomDataset,
                               SampleRequest, ShareGPTDataset, SonnetDataset,
                               VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -73,343 +72,36 @@ class BenchmarkMetrics:
    mean_ttft_ms: float
    median_ttft_ms: float
    std_ttft_ms: float
-    percentiles_ttft_ms: List[Tuple[float, float]]
+    percentiles_ttft_ms: list[tuple[float, float]]
    mean_tpot_ms: float
    median_tpot_ms: float
    std_tpot_ms: float
-    percentiles_tpot_ms: List[Tuple[float, float]]
+    percentiles_tpot_ms: list[tuple[float, float]]
    mean_itl_ms: float
    median_itl_ms: float
    std_itl_ms: float
-    percentiles_itl_ms: List[Tuple[float, float]]
+    percentiles_itl_ms: list[tuple[float, float]]
    # E2EL stands for end-to-end latency per request.
    # It is the time taken on the client side from sending
    # a request to receiving a complete response.
    mean_e2el_ms: float
    median_e2el_ms: float
    std_e2el_ms: float
-    percentiles_e2el_ms: List[Tuple[float, float]]
+    percentiles_e2el_ms: list[tuple[float, float]]
 def sample_sharegpt_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int, None]]:
    # Load the dataset.
    with open(dataset_path, encoding='utf-8') as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]
    # Shuffle the dataset.
    random.shuffle(dataset)
    # Filter out sequences that are too long or too short
    filtered_dataset: List[Tuple[str, int, int]] = []
    for i in range(len(dataset)):
        if len(filtered_dataset) == num_requests:
            break
        # Tokenize the prompts and completions.
        prompt = dataset[i][0]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len, None))
    return filtered_dataset
 def sample_burstgpt_requests(
    dataset_path: str,
    num_requests: int,
    random_seed: int,
    tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int, None]]:
    df = pd.read_csv(dataset_path)
    gpt4_df = df[df["Model"] == "GPT-4"]
    # Remove the failed requests (i.e., response length is 0)
    gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
    # Randomly sample num_requests from the dataset
    if num_requests <= len(gpt4_df):
        gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed)
    else:
        gpt4_df = gpt4_df.sample(n=num_requests,
                                 random_state=random_seed,
                                 replace=True)
    # Convert the dataframe to a list of tuples
    dataset = gpt4_df.values.tolist()
    input_requests = []
    for i in range(num_requests):
        input_len = int(dataset[i][2])
        output_len = int(dataset[i][3])
        prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size
                                   for j in range(input_len)])
        input_requests.append((prompt, input_len, output_len, None))
    return input_requests
 def sample_sonnet_requests(
    dataset_path: str,
    num_requests: int,
    input_len: int,
    output_len: int,
    prefix_len: int,
    tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, str, int, int, None]]:
    assert (
        input_len > prefix_len
    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
    # Load the dataset.
    with open(dataset_path, encoding='utf-8') as f:
        poem_lines = f.readlines()
    # Tokenize the poem lines.
    poem_token_ids = tokenizer(poem_lines).input_ids
    average_poem_len = sum(
        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
    # Base prefix for all requests.
    base_prompt = "Pick as many lines as you can from these poem lines:\n"
    base_message = [{
        "role": "user",
        "content": base_prompt,
    }]
    base_prompt_formatted = tokenizer.apply_chat_template(
        base_message, add_generation_prompt=True, tokenize=False)
    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
    assert (
        input_len > base_prompt_offset
    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
    num_input_lines = round(
        (input_len - base_prompt_offset) / average_poem_len)
    # First approximately `prefix_len` number of tokens in the
    # prompt are fixed poem lines.
    assert (
        prefix_len > base_prompt_offset
    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
    num_prefix_lines = round(
        (prefix_len - base_prompt_offset) / average_poem_len)
    prefix_lines = poem_lines[:num_prefix_lines]
    # Sample the rest of lines per request.
    sampled_requests: List[Tuple[str, int, int]] = []
    for _ in range(num_requests):
        num_lines_needed = num_input_lines - num_prefix_lines
        sampled_lines = "".join(prefix_lines +
                                random.choices(poem_lines, k=num_lines_needed))
        prompt = f"{base_prompt}{sampled_lines}"
        message = [
            {
                "role": "user",
                "content": prompt,
            },
        ]
        prompt_formatted = tokenizer.apply_chat_template(
            message, add_generation_prompt=True, tokenize=False)
        prompt_len = len(tokenizer(prompt_formatted).input_ids)
        sampled_requests.append(
            (prompt, prompt_formatted, prompt_len, output_len, None))
    return sampled_requests
 def sample_vision_arena_requests(
    dataset,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
    sampled_requests: List[Tuple[str, int, int, Dict[str,
                                                     Collection[str]]]] = []
    for data in dataset:
        if len(sampled_requests) == num_requests:
            break
        prompt = data["turns"][0][0]['content']
        prompt_token_ids = tokenizer(prompt).input_ids
        if fixed_output_len is None:
            # Default max output len is set to 128
            print("--hf-output-len is not provided. Using default value 128.")
            fixed_output_len = 128
        prompt_len = len(prompt_token_ids)
        output_len = fixed_output_len
        assert isinstance(
            data["images"][0],
            Image), ("Input image format must be `PIL.Image.Image`, "
                     f"given {type(data['image'])}.")
        image: Image = data["images"][0]
        image = image.convert("RGB")
        image_data = io.BytesIO()
        image.save(image_data, format='JPEG')
        image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
        mm_content = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            },
        }
        sampled_requests.append((prompt, prompt_len, output_len, mm_content))
    return sampled_requests
 def sample_hf_requests(
    dataset_path: str,
    dataset_subset: Optional[str],
    dataset_split: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    random_seed: int,
    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
    # Special case for vision_arena dataset
    if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \
        and dataset_subset is None:
        assert dataset_split == "train"
        dataset = load_dataset(dataset_path,
                               name=dataset_subset,
                               split=dataset_split,
                               streaming=True)
        dataset = dataset.shuffle(seed=random_seed)
        return sample_vision_arena_requests(dataset, num_requests, tokenizer,
                                            fixed_output_len)
    dataset = load_dataset(dataset_path,
                           name=dataset_subset,
                           split=dataset_split,
                           streaming=True)
    assert "conversations" in dataset.features, (
        "HF Dataset must have 'conversations' column.")
    filter_func = lambda x: len(x["conversations"]) >= 2
    filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
    sampled_requests: List[Tuple[str, int, int, Dict[str,
                                                     Collection[str]]]] = []
    for data in filtered_dataset:
        if len(sampled_requests) == num_requests:
            break
        # Tokenize the prompts and completions.
        prompt = data["conversations"][0]["value"]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = data["conversations"][1]["value"]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
            # Prune too short sequences.
            continue
        if fixed_output_len is None and \
            (prompt_len > 1024 or prompt_len + output_len > 2048):
            # Prune too long sequences.
            continue
        if "image" in data and isinstance(data["image"], Image):
            image: Image = data["image"]
            image = image.convert("RGB")
            image_data = io.BytesIO()
            image.save(image_data, format='JPEG')
            image_base64 = base64.b64encode(
                image_data.getvalue()).decode("utf-8")
            mm_content = {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_base64}"
                },
            }
        elif "image" in data and isinstance(data["image"], str):
            if (data["image"].startswith("http://") or \
                data["image"].startswith("file://")):
                image_url = data["image"]
            else:
                image_url = f"file://{data['image']}"
            mm_content = {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
            }
        else:
            mm_content = None
        sampled_requests.append((prompt, prompt_len, output_len, mm_content))
    return sampled_requests
 def sample_random_requests(
    prefix_len: int,
    input_len: int,
    output_len: int,
    num_prompts: int,
    range_ratio: float,
    tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int]]:
    prefix_token_ids = np.random.randint(0,
                                         tokenizer.vocab_size,
                                         size=prefix_len).tolist()
    input_lens = np.random.randint(
        int(input_len * range_ratio),
        input_len + 1,
        size=num_prompts,
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio),
        output_len + 1,
        size=num_prompts,
    )
    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
    input_requests = []
    for i in range(num_prompts):
        prompt = tokenizer.decode(prefix_token_ids +
                                  [(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])
        input_requests.append((prompt, int(prefix_len + input_lens[i]),
                               int(output_lens[i]), None))
    return input_requests
 async def get_request(
-    input_requests: List[Tuple[str, int, int]],
+    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float = 1.0,
-) -> AsyncGenerator[Tuple[str, int, int], None]:
+) -> AsyncGenerator[SampleRequest, None]:
    """
    Asynchronously generates requests at a specified rate
    with OPTIONAL burstiness.
    Args:
        input_requests:
-            A list of input requests, each represented as a tuple.
+            A list of input requests, each represented as a SampleRequest.
        request_rate:
            The rate at which requests are generated (requests/s).
        burstiness (optional):
@@ -421,7 +113,7 @@ async def get_request(
            in more bursty requests, while a higher burstiness value
            (burstiness > 1) results in a more uniform arrival of requests.
    """
-    input_requests = iter(input_requests)
+    input_requests: Iterable[SampleRequest] = iter(input_requests)
    # Calculate scale parameter theta to maintain the desired request_rate.
    assert burstiness > 0, (
@@ -443,28 +135,28 @@ async def get_request(
 def calculate_metrics(
-    input_requests: List[Tuple[str, int, int]],
+    input_requests: list[SampleRequest],
-    outputs: List[RequestFuncOutput],
+    outputs: list[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
-    selected_percentile_metrics: List[str],
+    selected_percentile_metrics: list[str],
-    selected_percentiles: List[float],
+    selected_percentiles: list[float],
-    goodput_config_dict: Dict[str, float],
+    goodput_config_dict: dict[str, float],
-) -> Tuple[BenchmarkMetrics, List[int]]:
+) -> tuple[BenchmarkMetrics, list[int]]:
-    actual_output_lens: List[int] = []
+    actual_output_lens: list[int] = []
    total_input = 0
    completed = 0
    good_completed = 0
-    itls: List[float] = []
+    itls: list[float] = []
-    tpots: List[float] = []
+    tpots: list[float] = []
-    all_tpots: List[float] = []
+    all_tpots: list[float] = []
-    ttfts: List[float] = []
+    ttfts: list[float] = []
-    e2els: List[float] = []
+    e2els: list[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            output_len = outputs[i].output_tokens
-            if output_len is None:
+            if not output_len:
                # We use the tokenizer to count the number of output tokens
                # for some serving backends instead of looking at
                # len(outputs[i].itl) since multiple output tokens may be
@@ -474,7 +166,7 @@ def calculate_metrics(
                    tokenizer(outputs[i].generated_text,
                              add_special_tokens=False).input_ids)
            actual_output_lens.append(output_len)
-            total_input += input_requests[i][1]
+            total_input += input_requests[i].prompt_len
            tpot = 0
            if output_len > 1:
                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
@@ -557,19 +249,19 @@ async def benchmark(
    model_id: str,
    model_name: str,
    tokenizer: PreTrainedTokenizerBase,
-    input_requests: List[Tuple[str, int, int]],
+    input_requests: list[SampleRequest],
    logprobs: Optional[int],
    best_of: int,
    request_rate: float,
    burstiness: float,
    disable_tqdm: bool,
    profile: bool,
-    selected_percentile_metrics: List[str],
+    selected_percentile_metrics: list[str],
-    selected_percentiles: List[str],
+    selected_percentiles: list[float],
    ignore_eos: bool,
-    goodput_config_dict: Dict[str, float],
+    goodput_config_dict: dict[str, float],
    max_concurrency: Optional[int],
-    lora_modules: Optional[List[str]],
+    lora_modules: Optional[Iterable[str]],
    extra_body: Optional[dict],
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -577,12 +269,16 @@ async def benchmark(
        raise ValueError(f"Unknown backend: {backend}")
    print("Starting initial single prompt test run...")
-    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
+    test_prompt, test_prompt_len, test_output_len, test_mm_content = \
-        input_requests[0])
+        input_requests[0].prompt, input_requests[0].prompt_len, \
        input_requests[0].expected_output_len, \
            input_requests[0].multi_modal_data
    if backend != "openai-chat" and test_mm_content is not None:
        # multi-modal benchmark is only available on OpenAI Chat backend.
        raise ValueError(
            "Multi-modal content is only supported on 'openai-chat' backend.")
    assert test_mm_content is None or isinstance(test_mm_content, dict)
    test_input = RequestFuncInput(
        model=model_id,
        model_name=model_name,
@@ -591,9 +287,9 @@ async def benchmark(
        prompt_len=test_prompt_len,
        output_len=test_output_len,
        logprobs=logprobs,
        best_of=best_of,
        multi_modal_content=test_mm_content,
        ignore_eos=ignore_eos,
        extra_body=extra_body,
    )
    test_output = await request_func(request_func_input=test_input)
@@ -607,7 +303,8 @@ async def benchmark(
    if lora_modules:
        # For each input request, choose a LoRA module at random.
        lora_modules = iter(
-            [random.choice(lora_modules) for _ in range(len(input_requests))])
+            [random.choice(lora_modules) \
                for _ in range(len(input_requests))])
    if profile:
        print("Starting profiler...")
@@ -618,9 +315,9 @@ async def benchmark(
                                         prompt_len=test_prompt_len,
                                         output_len=test_output_len,
                                         logprobs=logprobs,
                                         best_of=best_of,
                                         multi_modal_content=test_mm_content,
-                                         ignore_eos=ignore_eos)
+                                         ignore_eos=ignore_eos,
                                         extra_body=extra_body)
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")
@@ -652,9 +349,11 @@ async def benchmark(
                                      pbar=pbar)
    benchmark_start_time = time.perf_counter()
-    tasks: List[asyncio.Task] = []
+    tasks: list[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate, burstiness):
-        prompt, prompt_len, output_len, mm_content = request
+        prompt, prompt_len, output_len, mm_content = request.prompt, \
            request.prompt_len, request.expected_output_len, \
                request.multi_modal_data
        req_model_id, req_model_name = model_id, model_name
        if lora_modules:
            req_lora_module = next(lora_modules)
@@ -667,14 +366,14 @@ async def benchmark(
                                              prompt_len=prompt_len,
                                              output_len=output_len,
                                              logprobs=logprobs,
                                              best_of=best_of,
                                              multi_modal_content=mm_content,
-                                              ignore_eos=ignore_eos)
+                                              ignore_eos=ignore_eos,
                                              extra_body=extra_body)
        tasks.append(
            asyncio.create_task(
                limited_request_func(request_func_input=request_func_input,
                                     pbar=pbar)))
-    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
    if profile:
        print("Stopping profiler...")
@@ -685,7 +384,6 @@ async def benchmark(
            prompt_len=test_prompt_len,
            output_len=test_output_len,
            logprobs=logprobs,
            best_of=best_of,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
@@ -820,7 +518,7 @@ def parse_goodput(slo_pairs):
 def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: Dict[str, Any],
+                                     results: dict[str, Any],
                                     file_name: str) -> None:
    metrics = [
        "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
@@ -841,8 +539,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
-        with open(pt_file, "w") as f:
+        write_to_json(pt_file, pt_records)
            json.dump(pt_records, f)
 def main(args: argparse.Namespace):
@@ -867,91 +564,119 @@ def main(args: argparse.Namespace):
                              tokenizer_mode=tokenizer_mode,
                              trust_remote_code=args.trust_remote_code)
-    if args.dataset is not None:
+    if args.dataset_name is None:
-        warnings.warn(
+        raise ValueError(
-            "The '--dataset' argument will be deprecated in the next "
+            "Please specify '--dataset-name' and the corresponding "
-            "release. Please use '--dataset-name' and "
+            "'--dataset-path' if required.")
            "'--dataset-path' in the future runs.",
            stacklevel=2)
        input_requests = sample_sharegpt_requests(
            dataset_path=args.dataset,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )
-    elif args.dataset_name == "sharegpt":
+    if args.dataset_name == "sonnet":
-        input_requests = sample_sharegpt_requests(
+        dataset = SonnetDataset(dataset_path=args.dataset_path)
-            dataset_path=args.dataset_path,
+        # For the "sonnet" dataset, formatting depends on the backend.
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )
    elif args.dataset_name == "burstgpt":
        input_requests = sample_burstgpt_requests(
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
            random_seed=args.seed,
            tokenizer=tokenizer,
        )
    elif args.dataset_name == "sonnet":
        # Do not format the prompt, pass to message directly
        if args.backend == "openai-chat":
-            input_requests = sample_sonnet_requests(
+            input_requests = dataset.sample(num_requests=args.num_prompts,
-                dataset_path=args.dataset_path,
+                                            input_len=args.sonnet_input_len,
-                num_requests=args.num_prompts,
+                                            output_len=args.sonnet_output_len,
-                input_len=args.sonnet_input_len,
+                                            prefix_len=args.sonnet_prefix_len,
-                output_len=args.sonnet_output_len,
+                                            tokenizer=tokenizer,
-                prefix_len=args.sonnet_prefix_len,
+                                            return_prompt_formatted=False)
                tokenizer=tokenizer,
            )
            input_requests = [(prompt, prompt_len, output_len, None)
                              for prompt, prompt_formatted, prompt_len,
                              output_len, _ in input_requests]
        else:
-            assert (
+            assert tokenizer.chat_template or tokenizer.default_chat_template, (
-                tokenizer.chat_template or tokenizer.default_chat_template
+                "Tokenizer/model must have chat template for sonnet dataset.")
-            ), "Tokenizer/model must have chat template for sonnet dataset."
+            input_requests = dataset.sample(num_requests=args.num_prompts,
-            input_requests = sample_sonnet_requests(
+                                            input_len=args.sonnet_input_len,
-                dataset_path=args.dataset_path,
+                                            output_len=args.sonnet_output_len,
-                num_requests=args.num_prompts,
+                                            prefix_len=args.sonnet_prefix_len,
-                input_len=args.sonnet_input_len,
+                                            tokenizer=tokenizer,
-                output_len=args.sonnet_output_len,
+                                            return_prompt_formatted=True)
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
            )
            input_requests = [(prompt_formatted, prompt_len, output_len, None)
                              for prompt, prompt_formatted, prompt_len,
                              output_len, _ in input_requests]
    elif args.dataset_name == "hf":
-        input_requests = sample_hf_requests(
+        # all following datasets are implemented from the
        # HuggingFaceDataset base class
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = VisionArenaDataset
            args.hf_split = "train"
            args.hf_subset = None
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = InstructCoderDataset
            args.hf_split = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = ConversationDataset
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_class = AIMODataset
            args.hf_split = "train"
        else:
            supported_datasets = set([
                dataset_name for cls in HuggingFaceDataset.__subclasses__()
                for dataset_name in cls.SUPPORTED_DATASET_PATHS
            ])
            raise ValueError(
                f"Unsupported dataset path: {args.dataset_path}. "
                "Huggingface dataset only supports dataset_path"
                f" from one of following: {supported_datasets}. "
                "Please consider contributing if you would "
                "like to add support for additional dataset formats.")
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
            random_seed=args.seed,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
-            random_seed=args.seed,
+            output_len=args.hf_output_len,
            fixed_output_len=args.hf_output_len,
        )
    elif args.dataset_name == "random":
        input_requests = sample_random_requests(
            prefix_len=args.random_prefix_len,
            input_len=args.random_input_len,
            output_len=args.random_output_len,
            num_prompts=args.num_prompts,
            range_ratio=args.random_range_ratio,
            tokenizer=tokenizer,
        )
    else:
-        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+        # For datasets that follow a similar structure, use a mapping.
        dataset_mapping = {
            "sharegpt":
            lambda: ShareGPTDataset(random_seed=args.seed,
                                    dataset_path=args.dataset_path).sample(
                                        tokenizer=tokenizer,
                                        num_requests=args.num_prompts,
                                        output_len=args.sharegpt_output_len,
                                    ),
            "burstgpt":
            lambda: BurstGPTDataset(random_seed=args.seed,
                                    dataset_path=args.dataset_path).
            sample(tokenizer=tokenizer, num_requests=args.num_prompts),
            "random":
            lambda: RandomDataset(dataset_path=args.dataset_path).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.random_prefix_len,
                input_len=args.random_input_len,
                output_len=args.random_output_len,
                range_ratio=args.random_range_ratio,
            )
        }
        try:
            input_requests = dataset_mapping[args.dataset_name]()
        except KeyError as err:
            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
    goodput_config_dict = check_goodput_args(args)
    # Collect the sampling parameters.
    sampling_params = {
        k: v
        for k, v in {
            "top_p": args.top_p,
            "top_k": args.top_k,
            "min_p": args.min_p,
            "temperature": args.temperature
        }.items() if v is not None
    }
    # Sampling parameters are only supported by openai-compatible backend.
    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
        raise ValueError(
            "Sampling parameters are only supported by openai-compatible "
            "backends.")
    if "temperature" not in sampling_params:
        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()
@@ -966,7 +691,6 @@ def main(args: argparse.Namespace):
            tokenizer=tokenizer,
            input_requests=input_requests,
            logprobs=args.logprobs,
            best_of=args.best_of,
            request_rate=args.request_rate,
            burstiness=args.burstiness,
            disable_tqdm=args.disable_tqdm,
@@ -979,11 +703,12 @@ def main(args: argparse.Namespace):
            goodput_config_dict=goodput_config_dict,
            max_concurrency=args.max_concurrency,
            lora_modules=args.lora_modules,
            extra_body=sampling_params,
        ))
    # Save config and results to json
    if args.save_result:
-        result_json: Dict[str, Any] = {}
+        result_json: dict[str, Any] = {}
        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -991,7 +716,6 @@ def main(args: argparse.Namespace):
        result_json["backend"] = backend
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
        result_json["num_prompts"] = args.num_prompts
        # Metadata
@@ -1005,6 +729,15 @@ def main(args: argparse.Namespace):
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )
        if not args.save_detailed:
            # Remove fields with too many data points
            for field in [
                    "input_lens", "output_lens", "ttfts", "itls",
                    "generated_texts", "errors"
            ]:
                if field in result_json:
                    del result_json[field]
        # Traffic
        result_json["request_rate"] = (args.request_rate if args.request_rate
                                       < float("inf") else "inf")
@@ -1052,13 +785,6 @@ if __name__ == "__main__":
        default="/v1/completions",
        help="API endpoint.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to the ShareGPT dataset, will be deprecated in the "
        "next release.",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
@@ -1096,13 +822,6 @@ if __name__ == "__main__":
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--best-of",
        type=int,
        default=1,
        help="Generates `best_of` sequences per prompt and "
        "returns the best one.",
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-prompts",
@@ -1163,6 +882,12 @@ if __name__ == "__main__":
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
    parser.add_argument(
        "--save-detailed",
        action="store_true",
        help="When saving the results, whether to include per request "
        "information such as response, error, ttfs, tpots, etc.",
    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
@@ -1196,7 +921,7 @@ if __name__ == "__main__":
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
@@ -1204,7 +929,7 @@ if __name__ == "__main__":
        "--metric-percentiles",
        type=str,
        default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
@@ -1271,18 +996,23 @@ if __name__ == "__main__":
    random_group.add_argument(
        "--random-range-ratio",
        type=float,
-        default=1.0,
+        default=0.0,
-        help="Range of sampled ratio of input/output length, "
+        help="Range ratio for sampling input/output length, "
-        "used only for random sampling.",
+        "used only for random sampling. Must be in the range [0, 1) to define "
        "a symmetric sampling range"
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )
    random_group.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
-        help="Number of fixed prefix tokens before random "
+        help=("Number of fixed prefix tokens before the random context "
-        " context. The length range of context in a random "
+              "in a request. "
-        " request is [random-prefix-len, "
+              "The total input length is the sum of `random-prefix-len` and "
-        " random-prefix-len + random-prefix-len * random-range-ratio).")
+              "a random "
              "context length sampled from [input_len * (1 - range_ratio), "
              "input_len * (1 + range_ratio)]."),
    )
    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument("--hf-subset",
@@ -1301,6 +1031,33 @@ if __name__ == "__main__":
        "from the sampled HF dataset.",
    )
    sampling_group = parser.add_argument_group("sampling parameters")
    sampling_group.add_argument(
        "--top-p",
        type=float,
        default=None,
        help="Top-p sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--top-k",
        type=int,
        default=None,
        help="Top-k sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--min-p",
        type=float,
        default=None,
        help="Min-p sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Temperature sampling parameter. Only has effect on "
        "openai-compatible backends. If not specified, default to greedy "
        "decoding (i.e. temperature==0.0).")
    parser.add_argument(
        '--tokenizer-mode',
        type=str,
@@ -1327,4 +1084,5 @@ if __name__ == "__main__":
                        "script chooses a LoRA module at random.")
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -1,20 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
-r"""Benchmark online serving throughput with guided decoding.
+r"""Benchmark online serving throughput with structured outputs.
 On the server side, run one of the following commands:
    (vLLM OpenAI API server)
    vllm serve <your_model> --disable-log-requests
    (TGI backend)
    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
 On the client side, run:
-    python benchmarks/benchmark_serving.py \
+    python benchmarks/benchmark_serving_structured_output.py \
        --backend <backend> \
        --model <your_model> \
        --dataset json \
-        --guided-decoding-ratio 1.0 \
+        --structured-output-ratio 1.0 \
-        --guided-decoding-backend xgrammar \
+        --structured-output-backend auto \
        --request-rate 10 \
        --num-prompts 1000
@@ -24,14 +21,17 @@ On the client side, run:
 """
 import argparse
 import asyncio
 import copy
 import dataclasses
 import json
 import os
 import random
 import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Optional
 import datasets
 import numpy as np
@@ -51,6 +51,9 @@ try:
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser
 from vllm.v1.structured_output.utils import (
    has_xgrammar_unsupported_json_features)
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -66,22 +69,22 @@ class BenchmarkMetrics:
    mean_ttft_ms: float
    median_ttft_ms: float
    std_ttft_ms: float
-    percentiles_ttft_ms: List[Tuple[float, float]]
+    percentiles_ttft_ms: list[tuple[float, float]]
    mean_tpot_ms: float
    median_tpot_ms: float
    std_tpot_ms: float
-    percentiles_tpot_ms: List[Tuple[float, float]]
+    percentiles_tpot_ms: list[tuple[float, float]]
    mean_itl_ms: float
    median_itl_ms: float
    std_itl_ms: float
-    percentiles_itl_ms: List[Tuple[float, float]]
+    percentiles_itl_ms: list[tuple[float, float]]
    # E2EL stands for end-to-end latency per request.
    # It is the time taken on the client side from sending
    # a request to receiving a complete response.
    mean_e2el_ms: float
    median_e2el_ms: float
    std_e2el_ms: float
-    percentiles_e2el_ms: List[Tuple[float, float]]
+    percentiles_e2el_ms: list[tuple[float, float]]
@dataclasses.dataclass
@@ -104,25 +107,45 @@ class SampleRequest:
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
-                    args: argparse.Namespace) -> List[SampleRequest]:
+                    args: argparse.Namespace) -> list[SampleRequest]:
-    if args.dataset == 'json':
+    if args.dataset == 'json' or args.dataset == 'json-unique':
        if args.json_schema_path is None:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            args.json_schema_path = os.path.join(dir_path,
                                                 "structured_schemas",
                                                 "structured_schema_1.json")
        json_schemas = []
        with open(args.json_schema_path) as f:
            schema = json.load(f)
-        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
+
-        input_len = len(tokenizer(prompt).input_ids)
+        if args.dataset == 'json-unique':
-        print(f"Input length of the prompt: {input_len} tokens")
+            json_schemas = [
                copy.deepcopy(schema) for _ in range(args.num_prompts)
            ]
            for i in range(len(json_schemas)):
                json_schemas[i]["properties"][
                    f"__optional_field_{uuid.uuid4()}"] = {
                        "type":
                        "string",
                        "description":
                        "An unique optional field to avoid cached schemas"
                    }
        else:
            json_schemas = [schema] * args.num_prompts
        def gen_prompt(index: int):
            return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}"  # noqa: E501
        def get_schema(index: int):
            return json_schemas[index % len(json_schemas)]
        requests = [
-            SampleRequest(prompt=prompt,
+            SampleRequest(prompt=gen_prompt(i),
-                          prompt_len=input_len,
+                          prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
                          expected_output_len=args.output_len,
-                          schema=schema,
+                          schema=get_schema(i),
                          structure_type=args.structure_type)
-            for _ in range(args.num_prompts)
+            for i in range(args.num_prompts)
        ]
    elif args.dataset == "grammar":
@@ -187,10 +210,20 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
        ]
    elif args.dataset == "xgrammar_bench":
-        requests: List[SampleRequest] = []
+        requests: list[SampleRequest] = []
        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
                                        split="train")
-        print(f"dataset has {len(dataset)} entries")
+        full_dataset_len = len(dataset)
        def _filter_func(item):
            import json
            schema = json.loads(item["schema"])
            return not has_xgrammar_unsupported_json_features(schema)
        dataset = dataset.filter(_filter_func)
        num_filtered_out = full_dataset_len - len(dataset)
        print(f"dataset has {len(dataset)} entries after filtering "
              f"out {num_filtered_out} entries with unsupported features")
        len_dataset = len(dataset)
        for data_point_idx in range(args.num_prompts):
            idx = data_point_idx
@@ -214,10 +247,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
 async def get_request(
-    input_requests: List[SampleRequest],
+    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float = 1.0,
-) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
+) -> AsyncGenerator[tuple[int, SampleRequest], None]:
    """
    Asynchronously generates requests at a specified rate
    with OPTIONAL burstiness.
@@ -258,22 +291,23 @@ async def get_request(
 def calculate_metrics(
-    input_requests: List[Tuple[str, int, int]],
+    input_requests: list[tuple[str, int, int]],
-    outputs: List[RequestFuncOutput],
+    outputs: list[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
-    selected_percentile_metrics: List[str],
+    selected_percentile_metrics: list[str],
-    selected_percentiles: List[float],
+    selected_percentiles: list[float],
-) -> Tuple[BenchmarkMetrics, List[int]]:
+    goodput_config_dict: Optional[dict[str, float]] = None,
-    actual_output_lens: List[int] = []
+) -> tuple[BenchmarkMetrics, list[int]]:
    actual_output_lens: list[int] = []
    total_input = 0
    completed = 0
    good_completed = 0
-    itls: List[float] = []
+    itls: list[float] = []
-    tpots: List[float] = []
+    tpots: list[float] = []
-    all_tpots: List[float] = []
+    all_tpots: list[float] = []
-    ttfts: List[float] = []
+    ttfts: list[float] = []
-    e2els: List[float] = []
+    e2els: list[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            # We use the tokenizer to count the number of output tokens for all
@@ -287,10 +321,10 @@ def calculate_metrics(
            total_input += input_requests[i].prompt_len
            tpot = 0
            if output_len > 1:
-                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
-                                                                 1)
+                tpot = latency_minus_ttft / (output_len - 1)
                tpots.append(tpot)
-            outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
+            outputs[i].tpot = tpot
            # Note: if output_len <= 1, we regard tpot as 0 for goodput
            all_tpots.append(tpot)
            itls += outputs[i].itl
@@ -300,6 +334,28 @@ def calculate_metrics(
        else:
            actual_output_lens.append(0)
    if goodput_config_dict:
        valid_metrics = []
        slo_values = []
        if "ttft" in goodput_config_dict:
            valid_metrics.append(ttfts)
            slo_values.append(goodput_config_dict["ttft"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
        if "tpot" in goodput_config_dict:
            valid_metrics.append(all_tpots)
            slo_values.append(goodput_config_dict["tpot"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
        if "e2el" in goodput_config_dict:
            valid_metrics.append(e2els)
            slo_values.append(goodput_config_dict["e2el"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
        for req_metric in zip(*valid_metrics):
            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
            if is_good_req:
                good_completed += 1
    if completed == 0:
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
@@ -345,17 +401,18 @@ async def benchmark(
    base_url: str,
    model_id: str,
    tokenizer: PreTrainedTokenizerBase,
-    input_requests: List[SampleRequest],
+    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float,
    disable_tqdm: bool,
    profile: bool,
-    selected_percentile_metrics: List[str],
+    selected_percentile_metrics: list[str],
-    selected_percentiles: List[str],
+    selected_percentiles: list[str],
    ignore_eos: bool,
    max_concurrency: Optional[int],
-    guided_decoding_ratio: float,
+    structured_output_ratio: float,
-    guided_decoding_backend: str,
+    structured_output_backend: str,
    goodput_config_dict: Optional[dict[str, float]] = None,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -366,16 +423,18 @@ async def benchmark(
        extra_body = {}
        # Add the schema to the extra_body
        extra_body[request.structure_type] = request.schema
-        # Add the specific guided_decoding_backend
+        # Add the specific structured_output_backend
-        extra_body["guided_decoding_backend"] = guided_decoding_backend
+        extra_body["guided_decoding_backend"] = structured_output_backend
        return extra_body
    print("Starting initial single prompt test run...")
-    guided_decoding_req_idx = random.sample(
+    structured_output_req_idx = random.sample(
        range(len(input_requests)),
-        int(len(input_requests) * guided_decoding_ratio))
+        int(len(input_requests) * structured_output_ratio))
    test_request = input_requests[0]
    test_req_extra_body = (prepare_extra_body(test_request)
                           if 0 in structured_output_req_idx else None)
    test_input = RequestFuncInput(
        model=model_id,
        prompt=test_request.prompt,
@@ -383,7 +442,7 @@ async def benchmark(
        prompt_len=test_request.prompt_len,
        output_len=test_request.expected_output_len,
        ignore_eos=ignore_eos,
-        extra_body=prepare_extra_body(test_request),
+        extra_body=test_req_extra_body,
    )
    test_output = await request_func(request_func_input=test_input)
    if not test_output.success:
@@ -402,7 +461,7 @@ async def benchmark(
            prompt_len=test_request.prompt_len,
            output_len=test_request.expected_output_len,
            ignore_eos=ignore_eos,
-            extra_body=prepare_extra_body(test_request),
+            extra_body=test_req_extra_body,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
@@ -435,12 +494,12 @@ async def benchmark(
                                      pbar=pbar)
    benchmark_start_time = time.perf_counter()
-    tasks: List[asyncio.Task] = []
+    tasks: list[asyncio.Task] = []
-    expected: List[str] = []
+    expected: list[str] = []
    async for i, request in get_request(input_requests, request_rate,
                                        burstiness):
        extra_body = prepare_extra_body(
-            request) if i in guided_decoding_req_idx else None
+            request) if i in structured_output_req_idx else None
        request_func_input = RequestFuncInput(
            model=model_id,
            prompt=request.prompt,
@@ -455,7 +514,7 @@ async def benchmark(
            asyncio.create_task(
                limited_request_func(request_func_input=request_func_input,
                                     pbar=pbar)))
-    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
    if profile:
        print("Stopping profiler...")
@@ -483,6 +542,7 @@ async def benchmark(
        tokenizer=tokenizer,
        selected_percentile_metrics=selected_percentile_metrics,
        selected_percentiles=selected_percentiles,
        goodput_config_dict=goodput_config_dict,
    )
    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -494,6 +554,9 @@ async def benchmark(
                                 metrics.total_output))
    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                    metrics.request_throughput))
    if goodput_config_dict:
        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
                                        metrics.request_goodput))
    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
                                    metrics.output_throughput))
    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@@ -617,6 +680,40 @@ def evaluate(ret, args):
            100) if len(not_none_scores) > 0 else None
 def parse_goodput(slo_pairs):
    goodput_config_dict = {}
    try:
        for slo_pair in slo_pairs:
            slo_name, slo_val = slo_pair.split(":")
            goodput_config_dict[slo_name] = float(slo_val)
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            "Specify service level objectives for goodput as \"KEY:VALUE\" "
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds.") from err
    return goodput_config_dict
 def check_goodput_args(args):
    goodput_config_dict = {}
    VALID_NAMES = ["ttft", "tpot", "e2el"]
    if args.goodput:
        goodput_config_dict = parse_goodput(args.goodput)
        for slo_name, slo_val in goodput_config_dict.items():
            if slo_name not in VALID_NAMES:
                raise ValueError(
                    f"Invalid metric name found, {slo_name}: {slo_val}. "
                    "The service level objective name should be one of "
                    f"{str(VALID_NAMES)}. ")
            if slo_val < 0:
                raise ValueError(
                    f"Invalid value found, {slo_name}: {slo_val}. "
                    "The service level objective value should be "
                    "non-negative.")
    return goodput_config_dict
 def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
@@ -633,8 +730,11 @@ def main(args: argparse.Namespace):
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"
-    tokenizer = get_tokenizer(tokenizer_id,
+    tokenizer = get_tokenizer(
-                              trust_remote_code=args.trust_remote_code)
+        tokenizer_id,
        trust_remote_code=args.trust_remote_code,
        tokenizer_mode=args.tokenizer_mode,
    )
    if args.dataset == 'grammar':
        args.structure_type = 'guided_grammar'
@@ -645,10 +745,10 @@ def main(args: argparse.Namespace):
    else:
        args.structure_type = 'guided_json'
-    if args.no_guided_decoding:
+    if args.no_structured_output:
-        args.guided_decoding_ratio = 0
+        args.structured_output_ratio = 0
    if args.save_results:
-        result_file_name = f'{args.guided_decoding_ratio}guided'
+        result_file_name = f'{args.structured_output_ratio}guided'
        result_file_name += f"_{backend}"
        result_file_name += f"_{args.request_rate}qps"
        result_file_name += f"_{args.model.split('/')[-1]}"
@@ -661,6 +761,8 @@ def main(args: argparse.Namespace):
    input_requests = sample_requests(tokenizer, args)
    goodput_config_dict = check_goodput_args(args)
    benchmark_result, ret = asyncio.run(
        benchmark(
            backend=backend,
@@ -679,8 +781,9 @@ def main(args: argparse.Namespace):
            ],
            ignore_eos=args.ignore_eos,
            max_concurrency=args.max_concurrency,
-            guided_decoding_ratio=args.guided_decoding_ratio,
+            structured_output_ratio=args.structured_output_ratio,
-            guided_decoding_backend=args.guided_decoding_backend,
+            structured_output_backend=args.structured_output_backend,
            goodput_config_dict=goodput_config_dict,
        ))
    # Save config and results to json
@@ -740,10 +843,12 @@ if __name__ == "__main__":
        default="/v1/completions",
        help="API endpoint.",
    )
-    parser.add_argument(
+    parser.add_argument("--dataset",
-        "--dataset",
+                        default='json',
-        default='json',
+                        choices=[
-        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+                            'json', 'json-unique', 'grammar', 'regex',
                            'choice', 'xgrammar_bench'
                        ])
    parser.add_argument("--json_schema_path",
                        type=str,
                        default=None,
@@ -772,6 +877,13 @@ if __name__ == "__main__":
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--num-prompts",
        type=int,
@@ -852,7 +964,7 @@ if __name__ == "__main__":
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
@@ -860,24 +972,39 @@ if __name__ == "__main__":
        "--metric-percentiles",
        type=str,
        default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )
-    parser.add_argument("--no-guided-decoding",
+    parser.add_argument(
        "--goodput",
        nargs="+",
        required=False,
        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
        "pairs, where the key is a metric name, and the value is in "
        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
        "separated by spaces. Allowed request level metric names are "
        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
    parser.add_argument("--no-structured-output",
                        action='store_true',
                        default=False,
                        help="Whether to disable JSON decoding or not.")
-    parser.add_argument("--guided-decoding-ratio",
+    parser.add_argument("--structured-output-ratio",
                        type=float,
                        default=1.0,
-                        help="Ratio of Guided Decoding requests")
+                        help="Ratio of Structured Outputs requests")
-    parser.add_argument("--guided-decoding-backend",
+    parser.add_argument("--structured-output-backend",
                        type=str,
-                        choices=["outlines", "lm-format-enforcer", "xgrammar"],
+                        choices=[
-                        default="xgrammar",
+                            "outlines", "lm-format-enforcer", "xgrammar",
-                        help="Backend to use for guided decoding")
+                            "guidance", "auto"
                        ],
                        default="auto",
                        help="Backend to use for structured outputs")
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -6,13 +6,16 @@ import json
 import os
 import random
 import time
-from functools import cache
+import warnings
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional, Union
 import torch
 import uvloop
-from benchmark_utils import convert_to_pytorch_benchmark_format
+from benchmark_dataset import (AIMODataset, BurstGPTDataset,
-from PIL import Image
+                               ConversationDataset, InstructCoderDataset,
                               RandomDataset, SampleRequest, ShareGPTDataset,
                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
@@ -20,163 +23,35 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer,
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
-from vllm.inputs import TextPrompt
+from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
-from vllm.lora.utils import get_adapter_absolute_path
+from vllm.outputs import RequestOutput
 from vllm.multimodal import MultiModalDataDict
 from vllm.sampling_params import BeamSearchParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@dataclasses.dataclass
 class SampleRequest:
    """A class representing a single inference request for benchmarking.
    Attributes:
        prompt: The input text prompt for the model.
        prompt_len: The length of the prompt in tokens.
        expected_output_len: The expected length of the output in tokens.
        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
            images).
        lora_request: Optional LoRARequest specifying the LoRA to use. 
    """
    prompt: str
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[MultiModalDataDict] = None
    lora_request: Optional[LoRARequest] = None
 def _get_prompt_for_image_model(question: str, *, model: str) -> str:
    """Prepend and append special tokens around the question to form a prompt.
    Args:
        question: The input question text to wrap with special tokens
        model: The name of the model being used, to determine which special
            tokens to add
    Returns:
        The formatted prompt string with appropriate special tokens for the
            model
    Raises:
        ValueError: If an unsupported model name is provided
    """
    model = model.lower()
    if "pixtral" in model:
        return f"<s>[INST]{question}\n[IMG][/INST]"
    raise ValueError(f"Unsupported model {model}")
@cache
 def lora_path_on_disk(lora_path: str) -> str:
    return get_adapter_absolute_path(lora_path)
 lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
 def get_random_lora_request(
        args: argparse.Namespace
 ) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
    global lora_tokenizer_cache
    lora_id = random.randint(1, args.max_loras)
    lora_request = LoRARequest(lora_name=str(lora_id),
                               lora_int_id=lora_id,
                               lora_path=lora_path_on_disk(args.lora_path))
    if lora_id not in lora_tokenizer_cache:
        lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
    return lora_request, lora_tokenizer_cache[lora_id]
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                    args: argparse.Namespace) -> List[SampleRequest]:
    dataset_path: str = args.dataset
    num_requests: int = args.num_prompts
    fixed_output_len: Optional[int] = args.output_len
    model: str = args.model
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Shuffle the dataset.
    random.shuffle(dataset)
    # Filter out sequences that are too long or too short
    filtered_dataset: List[SampleRequest] = []
    for data in tqdm(dataset,
                     total=len(filtered_dataset),
                     desc="sampling requests"):
        if len(filtered_dataset) == num_requests:
            break
        # Only keep the first two turns of each conversation.
        prompt = data["conversations"][0]["value"]
        completion = data["conversations"][1]["value"]
        multi_modal_data: Optional[MultiModalDataDict] = None
        if "image" in data:
            multi_modal_data = multi_modal_data or {}
            image_path = data["image"]
            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
            assert isinstance(image_path,
                              str), "Only support single image input"
            try:
                multi_modal_data["image"] = Image.open(image_path).convert(
                    "RGB")
            except FileNotFoundError:
                # Ignore datapoint where asset is missing
                continue
            prompt = _get_prompt_for_image_model(question=prompt, model=model)
        request_tokenizer = tokenizer
        lora_request: Optional[LoRARequest] = None
        if args.enable_lora:
            lora_request, lora_tokenizer = get_random_lora_request(args)
            if lora_tokenizer:
                request_tokenizer = lora_tokenizer
        # Tokenize the prompts and completions.
        prompt_token_ids = request_tokenizer(prompt).input_ids
        completion_token_ids = request_tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append(
            SampleRequest(prompt=prompt,
                          prompt_len=prompt_len,
                          expected_output_len=output_len,
                          multi_modal_data=multi_modal_data,
                          lora_request=lora_request))
    return filtered_dataset
 def run_vllm(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
-) -> float:
+    disable_detokenize: bool = False,
 ) -> tuple[float, Optional[list[RequestOutput]]]:
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))
-
+    assert all(
        llm.llm_engine.model_config.max_model_len >= (
            request.prompt_len + request.expected_output_len)
        for request in requests), (
            "Please ensure that max_model_len is greater than the sum of"
            " prompt_len and expected_output_len for all requests.")
    # Add the requests to the engine.
-    prompts: List[TextPrompt] = []
+    prompts: list[Union[TextPrompt, TokensPrompt]] = []
-    sampling_params: List[SamplingParams] = []
+    sampling_params: list[SamplingParams] = []
    for request in requests:
        prompts.append(
            TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
                       multi_modal_data=request.multi_modal_data)
            if "prompt_token_ids" in request.prompt else \
            TextPrompt(prompt=request.prompt,
                       multi_modal_data=request.multi_modal_data))
        sampling_params.append(
@@ -186,19 +61,21 @@ def run_vllm(
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            ))
-    lora_requests: Optional[List[LoRARequest]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
    if engine_args.enable_lora:
        lora_requests = [request.lora_request for request in requests]
    use_beam_search = False
    outputs = None
    if not use_beam_search:
        start = time.perf_counter()
-        llm.generate(prompts,
+        outputs = llm.generate(prompts,
-                     sampling_params,
+                               sampling_params,
-                     lora_request=lora_requests,
+                               lora_request=lora_requests,
-                     use_tqdm=True)
+                               use_tqdm=True)
        end = time.perf_counter()
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
@@ -216,26 +93,75 @@ def run_vllm(
                ignore_eos=True,
            ))
        end = time.perf_counter()
-    return end - start
+    return end - start, outputs
 def run_vllm_chat(
        requests: list[SampleRequest],
        n: int,
        engine_args: EngineArgs,
        disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
    """
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
    multimodal models as it properly handles multimodal inputs and chat
    formatting. For non-multimodal models, use run_vllm() instead.
    """
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        llm.llm_engine.model_config.max_model_len >= (
            request.prompt_len + request.expected_output_len)
        for request in requests), (
            "Please ensure that max_model_len is greater than the sum of "
            "prompt_len and expected_output_len for all requests.")
    prompts = []
    sampling_params: list[SamplingParams] = []
    for request in requests:
        prompts.append(request.prompt)
        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            ))
    start = time.perf_counter()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
    return end - start, outputs
 async def run_vllm_async(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
    disable_detokenize: bool = False,
 ) -> float:
    from vllm import SamplingParams
    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:
        assert all(
            llm.model_config.max_model_len >= (request.prompt_len +
                                               request.expected_output_len)
            for request in requests), (
                "Please ensure that max_model_len is greater than the sum of"
                " prompt_len and expected_output_len for all requests.")
        # Add the requests to the engine.
-        prompts: List[TextPrompt] = []
+        prompts: list[Union[TextPrompt, TokensPrompt]] = []
-        sampling_params: List[SamplingParams] = []
+        sampling_params: list[SamplingParams] = []
-        lora_requests: List[Optional[LoRARequest]] = []
+        lora_requests: list[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
                        multi_modal_data=request.multi_modal_data)
                if "prompt_token_ids" in request.prompt else \
                TextPrompt(prompt=request.prompt,
                           multi_modal_data=request.multi_modal_data))
            sampling_params.append(
@@ -245,6 +171,7 @@ async def run_vllm_async(
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    detokenize=not disable_detokenize,
                ))
            lora_requests.append(request.lora_request)
@@ -265,12 +192,13 @@ async def run_vllm_async(
 def run_hf(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
    disable_detokenize: bool = False,
 ) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
@@ -281,18 +209,21 @@ def run_hf(
    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
-    batch: List[str] = []
+    batch: list[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
-        prompt, prompt_len, output_len = requests[i]
+        prompt = requests[i].prompt
        prompt_len = requests[i].prompt_len
        output_len = requests[i].expected_output_len
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
-            _, next_prompt_len, next_output_len = requests[i + 1]
+            next_prompt_len = requests[i + 1].prompt_len
            next_output_len = requests[i + 1].expected_output_len
            if (max(max_prompt_len, next_prompt_len) +
                    max(max_output_len, next_output_len)) <= 2048:
                # We can add more requests to the batch.
@@ -310,8 +241,9 @@ def run_hf(
            use_cache=True,
            max_new_tokens=max_output_len,
        )
-        # Include the decoding time.
+        if not disable_detokenize:
-        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+            # Include the decoding time.
            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))
        # Clear the batch.
@@ -323,7 +255,7 @@ def run_hf(
 def run_mii(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
@@ -341,7 +273,7 @@ def run_mii(
 def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: Dict[str, Any]) -> None:
+                                     results: dict[str, Any]) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={
@@ -355,62 +287,77 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        with open(pt_file, "w") as f:
+        write_to_json(pt_file, pt_records)
-            json.dump(pt_records, f)
+
 def get_requests(args, tokenizer):
    # Common parameters for all dataset types.
    common_kwargs = {
        "dataset_path": args.dataset_path,
        "random_seed": args.seed,
    }
    sample_kwargs = {
        "tokenizer": tokenizer,
        "lora_path": args.lora_path,
        "max_loras": args.max_loras,
        "num_requests": args.num_prompts,
        "input_len": args.input_len,
        "output_len": args.output_len,
    }
    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
        dataset_cls = RandomDataset
    elif args.dataset_name == "sharegpt":
        dataset_cls = ShareGPTDataset
        if args.backend == "vllm-chat":
            sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset.")
        dataset_cls = SonnetDataset
        sample_kwargs["prefix_len"] = args.prefix_len
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = VisionArenaDataset
            common_kwargs['dataset_subset'] = None
            common_kwargs['dataset_split'] = "train"
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = InstructCoderDataset
            common_kwargs['dataset_split'] = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = ConversationDataset
            common_kwargs['dataset_subset'] = args.hf_subset
            common_kwargs['dataset_split'] = args.hf_split
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = AIMODataset
            common_kwargs['dataset_subset'] = None
            common_kwargs['dataset_split'] = "train"
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
    return dataset_cls(**common_kwargs).sample(**sample_kwargs)
 def main(args: argparse.Namespace):
    if args.seed is None:
        args.seed = 0
    print(args)
    random.seed(args.seed)
    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
-    if args.dataset is None:
+    requests = get_requests(args, tokenizer)
        vocab_size = tokenizer.vocab_size
        requests = []
        for _ in range(args.num_prompts):
            request_tokenizer = tokenizer
            lora_request: Optional[LoRARequest] = None
            if args.enable_lora:
                lora_request, lora_tokenizer = get_random_lora_request(args)
                if lora_tokenizer:
                    request_tokenizer = lora_tokenizer
            # Synthesize a prompt with the given input length.
            candidate_ids = [
                random.randint(0, vocab_size - 1)
                for _ in range(args.input_len)
            ]
            # As tokenizer may add additional tokens like BOS, we need to try
            # different lengths to get the desired input length.
            for _ in range(5):  # Max attempts to correct
                candidate_prompt = request_tokenizer.decode(candidate_ids)
                tokenized_len = len(request_tokenizer.encode(candidate_prompt))
                if tokenized_len == args.input_len:
                    break
                # Adjust length based on difference
                diff = args.input_len - tokenized_len
                if diff > 0:
                    candidate_ids.extend([
                        random.randint(100, vocab_size - 100)
                        for _ in range(diff)
                    ])
                else:
                    candidate_ids = candidate_ids[:diff]
            requests.append(
                SampleRequest(prompt=candidate_prompt,
                              prompt_len=args.input_len,
                              expected_output_len=args.output_len,
                              lora_request=lora_request))
    else:
        requests = sample_requests(tokenizer, args)
    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
    request_outputs: Optional[list[RequestOutput]] = None
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
@@ -419,31 +366,59 @@ def main(args: argparse.Namespace):
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                    args.disable_detokenize,
                ))
        else:
-            elapsed_time = run_vllm(requests, args.n,
+            elapsed_time, request_outputs = run_vllm(
-                                    EngineArgs.from_cli_args(args))
+                requests, args.n, EngineArgs.from_cli_args(args),
                args.disable_detokenize)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.hf_max_batch_size, args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code,
                              args.disable_detokenize)
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
    elif args.backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests, args.n, EngineArgs.from_cli_args(args),
            args.disable_detokenize)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(request.prompt_len + request.expected_output_len
+
-                           for request in requests)
+    if request_outputs:
-    total_output_tokens = sum(request.expected_output_len
+        # Note: with the vllm and vllm-chat backends,
-                              for request in requests)
+        # we have request_outputs, which we use to count tokens.
-    if is_multi_modal:
+        total_prompt_tokens = 0
-        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+        total_output_tokens = 0
        for ro in request_outputs:
            if not isinstance(ro, RequestOutput):
                continue
            total_prompt_tokens += len(
                ro.prompt_token_ids) if ro.prompt_token_ids else 0
            total_output_tokens += sum(
                len(o.token_ids) for o in ro.outputs if o)
        total_num_tokens = total_prompt_tokens + total_output_tokens
    else:
        total_num_tokens = sum(r.prompt_len + r.expected_output_len
                               for r in requests)
        total_output_tokens = sum(r.expected_output_len for r in requests)
        total_prompt_tokens = total_num_tokens - total_output_tokens
    if is_multi_modal and args.backend != "vllm-chat":
        print("\033[91mWARNING\033[0m: Multi-modal request with "
              f"{args.backend} backend detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")
-        # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length.
+        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
        # vllm-chat backend counts the image tokens now
    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
          f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
    print(f"Total num prompt tokens:  {total_prompt_tokens}")
    print(f"Total num output tokens:  {total_output_tokens}")
    # Output JSON results if specified
    if args.output_json:
@@ -459,18 +434,120 @@ def main(args: argparse.Namespace):
        save_to_pytorch_benchmark_format(args, results)
 def validate_args(args):
    """
    Validate command-line arguments.
    """
    # === Deprecation and Defaulting ===
    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2)
        args.dataset_path = args.dataset
    if not getattr(args, "tokenizer", None):
        args.tokenizer = args.model
    # === Backend Validation ===
    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
    if args.backend not in valid_backends:
        raise ValueError(f"Unsupported backend: {args.backend}")
    # === Dataset Configuration ===
    if not args.dataset and not args.dataset_path:
        print(
            "When dataset path is not set, it will default to random dataset")
        args.dataset_name = 'random'
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")
    # === Dataset Name Specific Checks ===
    # --hf-subset and --hf-split: only used
    # when dataset_name is 'hf'
    if args.dataset_name != "hf" and (
            getattr(args, "hf_subset", None) is not None
            or getattr(args, "hf_split", None) is not None):
        warnings.warn("--hf-subset and --hf-split will be ignored \
                since --dataset-name is not 'hf'.",
                      stacklevel=2)
    elif args.dataset_name == "hf":
        if args.dataset_path in (
                VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
                | ConversationDataset.SUPPORTED_DATASET_PATHS):
            assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend."  #noqa: E501
        elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
                                   | AIMODataset.SUPPORTED_DATASET_PATHS):
            assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend."  #noqa: E501
        else:
            raise ValueError(
                f"{args.dataset_path} is not supported by hf dataset.")
    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != 'random' and args.random_range_ratio is not None:
        warnings.warn("--random-range-ratio will be ignored since \
                --dataset-name is not 'random'.",
                      stacklevel=2)
    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
    # set.
    if args.dataset_name not in {"random", "sonnet", None
                                 } and args.prefix_len is not None:
        warnings.warn("--prefix-len will be ignored since --dataset-name\
                 is not 'random', 'sonnet', or not set.",
                      stacklevel=2)
    # === LoRA Settings ===
    if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError(
            "LoRA benchmarking is only supported for vLLM backend")
    if getattr(args, "enable_lora", False) and args.lora_path is None:
        raise ValueError("LoRA path must be provided when enable_lora is True")
    # === Backend-specific Validations ===
    if args.backend == "hf" and args.hf_max_batch_size is None:
        raise ValueError("HF max batch size is required for HF backend")
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")
    if args.backend in {"hf", "mii"} and getattr(args, "quantization",
                                                 None) is not None:
        raise ValueError("Quantization is only for vLLM backend.")
    if args.backend == "mii" and args.dtype != "auto":
        raise ValueError("dtype must be auto for MII backend.")
    if args.backend == "mii" and args.n != 1:
        raise ValueError("n must be 1 for MII backend.")
    if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError(
            "Tokenizer must be the same as the model for MII backend.")
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
-                        choices=["vllm", "hf", "mii"],
+                        choices=["vllm", "hf", "mii", "vllm-chat"],
                        default="vllm")
-    parser.add_argument("--dataset",
+    parser.add_argument(
        "--dataset-name",
        type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt")
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to the ShareGPT dataset, will be deprecated in\
            the next release. The dataset is expected to "
        "be a json in form of list[dict[..., conversations: "
        "list[dict[..., value: <prompt_or_response>]]]]")
    parser.add_argument("--dataset-path",
                        type=str,
                        default=None,
-                        help="Path to the dataset. The dataset is expected to "
+                        help="Path to the dataset")
                        "be a json in form of List[Dict[..., conversations: "
                        "List[Dict[..., value: <prompt_or_response>]]]]")
    parser.add_argument("--input-len",
                        type=int,
                        default=None,
@@ -505,6 +582,11 @@ if __name__ == "__main__":
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=("Do not detokenize the response (i.e. do not include "
              "detokenization time in the measurement)"))
    # LoRA
    parser.add_argument(
        "--lora-path",
@@ -512,43 +594,45 @@ if __name__ == "__main__":
        default=None,
        help="Path to the lora adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.")
    parser.add_argument(
        "--prefix-len",
        type=int,
        default=None,
        help=f"Number of prefix tokens to be used in RandomDataset "
        "and SonnetDataset. For RandomDataset, the total input "
        "length is the sum of prefix-len (default: "
        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
        "sampled from [input_len * (1 - range_ratio), "
        "input_len * (1 + range_ratio)]. For SonnetDataset, "
        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
        "controls how much of the input is fixed lines versus "
        "random lines, but the total input length remains approximately "
        "input_len tokens.")
    # random dataset
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=None,
        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
        "for sampling input/output length, "
        "used only for RandomDataset. Must be in the range [0, 1) to "
        "define a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )
    # hf dtaset
    parser.add_argument("--hf-subset",
                        type=str,
                        default=None,
                        help="Subset of the HF dataset.")
    parser.add_argument("--hf-split",
                        type=str,
                        default=None,
                        help="Split of the HF dataset.")
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
-    if args.dataset is None:
+    validate_args(args)
        assert args.input_len is not None
        assert args.output_len is not None
    else:
        assert args.input_len is None
    if args.enable_lora:
        assert args.lora_path is not None
    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.enable_lora is not None:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    elif args.backend == "mii":
        if args.dtype != "auto":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
        if args.tokenizer != args.model:
            raise ValueError("Tokenizer must be the same as the model for MII "
                             "backend.")
        if args.enable_lora is not None:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    main(args)
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -1,13 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 import argparse
 import json
 import math
 import os
-from typing import Any, Dict, List
+from typing import Any
 def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                        metrics: Dict[str, List],
+                                        metrics: dict[str, list],
-                                        extra_info: Dict[str, Any]) -> List:
+                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    on metric per record
@@ -34,6 +36,34 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                "extra_info": extra_info,
            },
        }
        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]
        records.append(record)
    return records
 class InfEncoder(json.JSONEncoder):
    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o
    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        return super().iterencode(self.clear_inf(o), *args, **kwargs)
 def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -5,7 +5,8 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from typing import Callable, Iterable, List, Tuple
+from collections.abc import Iterable
 from typing import Callable
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -228,7 +229,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+        MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
@@ -241,7 +242,7 @@ def run(dtype: torch.dtype,
 # output makers
 def make_output(data: Iterable[TMeasurement],
-                MKNs: Iterable[Tuple[int, int, int]],
+                MKNs: Iterable[tuple[int, int, int]],
                base_description: str,
                timestamp=None):
    print(f"== All Results {base_description} ====")
@@ -282,7 +283,7 @@ def run_model_bench(args):
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")
-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+    def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Cutlass bench utils
-from typing import Iterable, Tuple
+from collections.abc import Iterable
 import torch
@@ -27,7 +27,7 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
 def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+                      k: int) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5
@@ -63,7 +63,7 @@ def prune_to_2_4(tensor):
 def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
-                             k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+                             k: int) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5
@@ -88,7 +88,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
 def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
                        m: int, n: int, k: int) -> \
-                        Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
+                        tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    ABs = []
    for _ in range(num_tensors):
        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -5,7 +5,8 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from typing import Callable, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
 from typing import Callable, Optional
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -49,7 +50,7 @@ def bench_int8(
        n: int,
        label: str,
        sub_label: str,
-        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
@@ -101,7 +102,7 @@ def bench_fp8(
        n: int,
        label: str,
        sub_label: str,
-        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
@@ -180,7 +181,7 @@ def bench(dtype: torch.dtype,
          n: int,
          label: str,
          sub_label: str,
-          bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+          bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
    if dtype == torch.float8_e4m3fn:
@@ -195,8 +196,8 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]],
+        MKNs: Iterable[tuple[int, int, int]],
-        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype,
@@ -212,7 +213,7 @@ def run(dtype: torch.dtype,
 def make_output(data: Iterable[TMeasurement],
-                MKNs: Iterable[Tuple[int, int, int]],
+                MKNs: Iterable[tuple[int, int, int]],
                base_description: str,
                timestamp=None):
    print(f"== All Results {base_description} ====")
@@ -248,7 +249,7 @@ def run_model_bench(args):
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")
-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+    def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -2,9 +2,10 @@
 import pickle as pkl
 import time
 from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Iterable, List, Optional
+from typing import Callable, Optional
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -29,7 +30,7 @@ class bench_params_t:
                f'x DT {self.dtype}')
-def get_bench_params() -> List[bench_params_t]:
+def get_bench_params() -> list[bench_params_t]:
    ## Test Fixtures
    NUM_TOKENS = [2**x for x in range(11)]
    HIDDEN_SIZES = list(range(1024, 8129, 1024))
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -0,0 +1,340 @@
 # SPDX-License-Identifier: Apache-2.0
 import torch
 import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES_MOE
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8,
                                                            fused_experts,
                                                            fused_topk)
 from vllm.utils import FlexibleArgumentParser
 DEFAULT_MODELS = [
    "nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite",
    "ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m"
 ]
 DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512]
 DEFAULT_TP_SIZES = [1]
 PER_ACT_TOKEN_OPTS = [False]
 PER_OUT_CH_OPTS = [False]
 def to_fp8(tensor: torch.Tensor):
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(
        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
 def bench_run(results: list[benchmark.Measurement], model: str,
              num_experts: int, topk: int, per_act_token: bool,
              per_out_ch: bool, mkn: tuple[int, int, int]):
    label = "Quant Matmul"
    sub_label = (
        "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, "
        "MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch,
                          mkn))
    print(f"Testing: {sub_label}")
    (m, k, n) = mkn
    dtype = torch.half
    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10
    _, a_scale = ops.scaled_fp8_quant(a)
    w1_q = torch.empty((num_experts, 2 * n, k),
                       device="cuda",
                       dtype=torch.float8_e4m3fn)
    w2_q = torch.empty((num_experts, k, n),
                       device="cuda",
                       dtype=torch.float8_e4m3fn)
    w1_scale = torch.empty((num_experts, 1, 1),
                           device="cuda",
                           dtype=torch.float32)
    w2_scale = torch.empty((num_experts, 1, 1),
                           device="cuda",
                           dtype=torch.float32)
    ab_strides1 = torch.full((num_experts, ),
                             k,
                             device="cuda",
                             dtype=torch.int64)
    c_strides1 = torch.full((num_experts, ),
                            2 * n,
                            device="cuda",
                            dtype=torch.int64)
    ab_strides2 = torch.full((num_experts, ),
                             n,
                             device="cuda",
                             dtype=torch.int64)
    c_strides2 = torch.full((num_experts, ),
                            k,
                            device="cuda",
                            dtype=torch.int64)
    for expert in range(num_experts):
        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
    w1_q_notransp = w1_q.clone()
    w2_q_notransp = w2_q.clone()
    w1_q = w1_q.transpose(1, 2)
    w2_q = w2_q.transpose(1, 2)
    score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
    def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
                       topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                       w1_scale: torch.Tensor, w2_scale: torch.Tensor,
                       a_scale: torch.Tensor, num_repeats: int):
        for _ in range(num_repeats):
            fused_experts(a,
                          w1,
                          w2,
                          topk_weights,
                          topk_ids,
                          use_fp8_w8a8=True,
                          w1_scale=w1_scale,
                          w2_scale=w2_scale,
                          a1_scale=a_scale)
    def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor,
                        w1: torch.Tensor, w2: torch.Tensor,
                        w1_scale: torch.Tensor, w2_scale: torch.Tensor,
                        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                        ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
                        ab_strides2: torch.Tensor, c_strides2: torch.Tensor,
                        num_repeats: int):
        for _ in range(num_repeats):
            cutlass_moe_fp8(a,
                            w1,
                            w2,
                            w1_scale,
                            w2_scale,
                            topk_weights,
                            topk_ids,
                            ab_strides1,
                            c_strides1,
                            ab_strides2,
                            c_strides2,
                            a1_scale=a_scale)
    def run_cutlass_from_graph(
            a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
            w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
            topk_weights: torch.Tensor, topk_ids: torch.Tensor,
            ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
            ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
        with set_current_vllm_config(
                VllmConfig(parallel_config=ParallelConfig(
                    pipeline_parallel_size=1))):
            return cutlass_moe_fp8(a,
                                   w1_q,
                                   w2_q,
                                   w1_scale,
                                   w2_scale,
                                   topk_weights,
                                   topk_ids,
                                   ab_strides1,
                                   c_strides1,
                                   ab_strides2,
                                   c_strides2,
                                   a1_scale=a_scale)
    def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor,
                              w2: torch.Tensor, topk_weights: torch.Tensor,
                              topk_ids: torch.Tensor, w1_scale: torch.Tensor,
                              w2_scale: torch.Tensor, a_scale: torch.Tensor):
        with set_current_vllm_config(
                VllmConfig(parallel_config=ParallelConfig(
                    pipeline_parallel_size=1))):
            return fused_experts(a,
                                 w1,
                                 w2,
                                 topk_weights,
                                 topk_ids,
                                 use_fp8_w8a8=True,
                                 w1_scale=w1_scale,
                                 w2_scale=w2_scale,
                                 a1_scale=a_scale)
    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
        torch.cuda.synchronize()
    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale,
                               topk_weights, topk_ids, ab_strides1, c_strides1,
                               ab_strides2, c_strides2)
    torch.cuda.synchronize()
    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights,
                              topk_ids, w1_scale, w2_scale, a_scale)
    torch.cuda.synchronize()
    min_run_time = 5
    num_warmup = 5
    num_runs = 25
    globals = {
        # Baseline params
        "w1": w1,
        "w2": w2,
        "score": score,
        "topk": topk,
        "w1_q_notransp": w1_q_notransp,
        "w2_q_notransp": w2_q_notransp,
        # Cutlass params
        "a_scale": a_scale,
        "w1_q": w1_q,
        "w2_q": w2_q,
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
        "ab_strides1": ab_strides1,
        "c_strides1": c_strides1,
        "ab_strides2": ab_strides2,
        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
        # Gen params
        "a": a,
        "topk_weights": topk_weights,
        "topk_ids": topk_ids,
        "num_runs": num_runs,
        # Kernels
        "run_triton_moe": run_triton_moe,
        "run_cutlass_moe": run_cutlass_moe,
        "replay_graph": replay_graph,
    }
    # Warmup
    run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids,
                   w1_scale, w2_scale, a_scale, num_warmup)
    results.append(
        benchmark.Timer(
            stmt=
            "run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe",
        ).blocked_autorange(min_run_time=min_run_time))
    # Warmup
    replay_graph(triton_graph, num_warmup)
    results.append(
        benchmark.Timer(
            stmt="replay_graph(triton_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time))
    # Warmup
    run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights,
                    topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2,
                    num_warmup)
    results.append(
        benchmark.Timer(
            stmt=
            "run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="grouped_gemm_moe",
        ).blocked_autorange(min_run_time=min_run_time))
    # Warmup
    replay_graph(cutlass_graph, num_warmup)
    results.append(
        benchmark.Timer(
            stmt="replay_graph(cutlass_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="grouped_gemm_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time))
 def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")
    results: list[benchmark.Measurement] = []
    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp
                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue
                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue
                for per_act_token in PER_ACT_TOKEN_OPTS:
                    for per_out_ch in PER_OUT_CH_OPTS:
                        for size_m in DEFAULT_BATCH_SIZES:
                            mkn = (size_m, size_k, size_n)
                            bench_run(results, model, num_experts, topk,
                                      per_act_token, per_out_ch, mkn)
    compare = benchmark.Compare(results)
    compare.print()
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark Marlin across specified models/shapes/batches")
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes",
                        nargs="+",
                        type=int,
                        default=DEFAULT_TP_SIZES)
    parser.add_argument("--batch-sizes",
                        nargs="+",
                        type=int,
                        default=DEFAULT_BATCH_SIZES)
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
    parser.add_argument("--limit-per-act-token",
                        nargs="+",
                        type=int,
                        default=[])
    parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])
    args = parser.parse_args()
    main(args)
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -40,7 +40,7 @@ def main(num_tokens: int,
        end_time = time.perf_counter()
        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters
    # Warmup.
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -9,7 +9,7 @@ from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Optional
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -17,11 +17,7 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES
-from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
+from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
 from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
 from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
 from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
 from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.utils import FlexibleArgumentParser
@@ -61,15 +57,15 @@ def make_rand_lora_weight_tensor(k: int,
 def make_rand_tensors(
-    a_shape: Tuple[int],
+    a_shape: tuple[int],
-    b_shape: Tuple[int],
+    b_shape: tuple[int],
-    c_shape: Tuple[int],
+    c_shape: tuple[int],
    a_dtype: torch.dtype,
    b_dtype: torch.dtype,
    c_dtype: torch.dtype,
    num_slices: int,
    device: str = "cuda",
-) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
+) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]:
    """
    Make LoRA input/output matrices.
    """
@@ -89,7 +85,7 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
                             sort_by_lora_id: bool,
                             device: str) -> torch.Tensor:
    """
-    All prompts are mapped to a Lora ID in range [0, num_active_loras).
+    All prompts are mapped to a LoRA ID in range [0, num_active_loras).
    where 0 refers to first lora, 1 refers to second lora and so on.
    """
    assert num_active_loras > 0
@@ -135,7 +131,7 @@ def make_token_lora_mapping(num_tokens: int, num_prompts: int,
 def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
-                   lora_weights: List[torch.Tensor],
+                   lora_weights: list[torch.Tensor],
                   seq_lens_cpu: torch.Tensor,
                   prompt_lora_mapping_cpu: torch.Tensor, scaling: float,
                   add_inputs: Optional[bool]):
@@ -153,7 +149,6 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
        result = torch.nn.functional.linear(x, w)
        result *= scaling
        out_list.append(result)
    torch.cat(out_list, dim=0)
    cat_result = torch.cat(out_list, dim=0)
@@ -167,62 +162,35 @@ class OpType(Enum):
    """
    LoRA Ops to benchmark and its properties.
    """
-    SGMV_SHRINK = auto()
+    LORA_SHRINK = auto()
-    BGMV_SHRINK = auto()
+    LORA_EXPAND = auto()
    SGMV_EXPAND = auto()
    BGMV_EXPAND = auto()
    BGMV_EXPAND_SLICE = auto()
    @staticmethod
    def from_str(s: str) -> "OpType":
-        if s.lower() == 'sgmv_shrink':
+        if s.lower() == "lora_shrink":
-            return OpType.SGMV_SHRINK
+            return OpType.LORA_SHRINK
-        if s.lower() == 'sgmv_expand':
+        if s.lower() == "lora_expand":
-            return OpType.SGMV_EXPAND
+            return OpType.LORA_EXPAND
        if s.lower() == 'bgmv_shrink':
            return OpType.BGMV_SHRINK
        if s.lower() == 'bgmv_expand':
            return OpType.BGMV_EXPAND
        if s.lower() == "bgmv_expand_slice":
            return OpType.BGMV_EXPAND_SLICE
        raise ValueError(f"Unrecognized str {s} to convert to OpType")
    def is_shrink_fn(self) -> bool:
-        return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK]
+        return self in [OpType.LORA_SHRINK]
    def is_expand_fn(self) -> bool:
-        return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND]
+        return self in [OpType.LORA_EXPAND]
-    def is_prefill_op(self) -> bool:
+    def num_slices(self) -> list[int]:
-        return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND]
+        return [1, 2, 3]
    def is_decode_op(self) -> bool:
        return self in [
            OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE
        ]
    def is_expand_slice_fn(self) -> bool:
        return self in [OpType.BGMV_EXPAND_SLICE]
    def num_slices(self) -> List[int]:
        if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]:
            # SGMV kernels supports slices
            return [1, 2, 3]
        if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
            return [1]
        if self in [OpType.BGMV_EXPAND_SLICE]:
            return [2, 3]
        raise ValueError(f"Unrecognized OpType {self}")
    def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
-            lora_rank: int) -> Tuple[int, int, int]:
+            lora_rank: int) -> tuple[int, int, int]:
        num_tokens = batch_size * seq_length
        if self.is_shrink_fn():
            m = num_tokens
            k = hidden_size
            n = lora_rank
        else:
-            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            assert self.is_expand_fn()
            m = num_tokens
            k = lora_rank
            n = hidden_size
@@ -230,20 +198,20 @@ class OpType(Enum):
    def matmul_dtypes(
            self, op_dtype: torch.dtype
-    ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]:
+    ) -> tuple[torch.dtype, torch.dtype, torch.dtype]:
        """
        return a type, b type and c type for A x B = C
        """
        if self.is_shrink_fn():
            return op_dtype, op_dtype, torch.float32
        else:
-            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            assert self.is_expand_fn()
            return torch.float32, op_dtype, op_dtype
    def matmul_shapes(
            self, batch_size: int, seq_length: int, hidden_size: int,
            lora_rank: int, num_loras: int,
-            num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]:
+            num_slices: int) -> tuple[tuple[int], tuple[int], tuple[int]]:
        """
        Given num_slices, return the shapes of the A, B, and C matrices
        in A x B = C, for the op_type
@@ -251,56 +219,39 @@ class OpType(Enum):
        m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
        b_shape = (num_loras, n, k)  # col-major
-        if self == OpType.SGMV_SHRINK:
+        if self in [OpType.LORA_SHRINK]:
-            # SGMV shrink supports num_slices inherently in the kernel
+            # LoRA shrink kernels support num_slices inherently in the kernel.
            return ((m, k), b_shape, (num_slices, m, n))
-        if self == OpType.SGMV_EXPAND:
+        if self in [OpType.LORA_EXPAND]:
-            # SGMV expand supports num_slices inherently in the kernel
+            # LoRA expand kernels support num_slices inherently in the kernel
            return ((num_slices, m, k), b_shape, (m, n * num_slices))
        if self == OpType.BGMV_SHRINK:
            return ((m, k), b_shape, (m, n))
        if self == OpType.BGMV_EXPAND:
            return ((m, k), b_shape, (m, n))
        if self == OpType.BGMV_EXPAND_SLICE:
            return ((num_slices, m, k), b_shape, (m, n * num_slices))
        raise ValueError(f"Unrecognized op_type {self}")
    def bench_fn(self) -> Callable:
        if self == OpType.LORA_SHRINK:
            return lora_shrink
        if self == OpType.LORA_EXPAND:
            return lora_expand
        def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]):
            for x in kwargs_list:
                bgmv_expand_slice(**x)
        if self == OpType.SGMV_SHRINK:
            return sgmv_shrink
        if self == OpType.SGMV_EXPAND:
            return sgmv_expand
        if self == OpType.BGMV_SHRINK:
            return bgmv_shrink
        if self == OpType.BGMV_EXPAND:
            return bgmv_expand
        if self == OpType.BGMV_EXPAND_SLICE:
            return emulate_bgmv_expand_slice
        raise ValueError(f"Unrecognized optype {self}")
    def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor,
-                           lora_weights: List[torch.Tensor],
+                           lora_weights: list[torch.Tensor],
                           **kwargs) -> Callable:
-        """Each benchmark operation expected the input, lora_weights and outputs
+        """Each benchmark operation expects the input, lora_weights and outputs
           in a slightly different format. Refer to self.matmul_shapes().
           run_ref_group_gemm accounts for those differences in executing a
           reference group gemm for correctness testing.
        """
        w_dtype = lora_weights[0].dtype
        num_slices = len(lora_weights)
-        if self == OpType.SGMV_SHRINK:
+        if self in [OpType.LORA_SHRINK]:
            for slice_idx in range(num_slices):
                ref_group_gemm(ref_out=output[slice_idx, :],
                               input=input,
                               lora_weights=lora_weights[slice_idx],
                               **kwargs)
-        if self == OpType.SGMV_EXPAND:
+        elif self in [OpType.LORA_EXPAND]:
            hidden_size = lora_weights[0].shape[1]
            for slice_idx in range(num_slices):
                slice_offset = slice_idx * hidden_size
@@ -309,28 +260,8 @@ class OpType(Enum):
                    input=input[slice_idx].clone().to(dtype=w_dtype),
                    lora_weights=lora_weights[slice_idx],
                    **kwargs)
-        if self == OpType.BGMV_SHRINK:
+        else:
-            assert num_slices == 1
+            raise ValueError(f"Unrecognized optype {self}")
            ref_group_gemm(ref_out=output,
                           input=input,
                           lora_weights=lora_weights[0],
                           **kwargs)
        if self == OpType.BGMV_EXPAND:
            assert num_slices == 1
            ref_group_gemm(ref_out=output,
                           input=input.clone().to(dtype=w_dtype),
                           lora_weights=lora_weights[0],
                           **kwargs)
        if self == OpType.BGMV_EXPAND_SLICE:
            hidden_size = lora_weights[0].shape[1]
            for slice_idx in range(num_slices):
                slice_offset = slice_idx * hidden_size
                ref_group_gemm(
                    ref_out=output[:, slice_offset:slice_offset + hidden_size],
                    input=input[slice_idx].clone().to(dtype=w_dtype),
                    lora_weights=lora_weights[slice_idx],
                    **kwargs)
        raise ValueError(f"Unrecognized optype {self}")
@dataclass
@@ -384,13 +315,13 @@ class BenchmarkTensors:
    """
    # matmul tensors
    input: torch.Tensor
-    lora_weights_lst: List[torch.Tensor]
+    lora_weights_lst: list[torch.Tensor]
    output: torch.Tensor
-    # metadata tensors
+    # LoRA kernel metadata
    lora_kernel_meta: LoRAKernelMeta
    # Metadata tensors used in testing correctness
    seq_lens: torch.Tensor
    seq_start_loc: torch.Tensor
    prompt_lora_mapping: torch.Tensor
    token_lora_mapping: torch.Tensor
    def io_types(self) -> str:
        return (f"{dtype_to_str(self.input.dtype)}x"
@@ -417,26 +348,29 @@ class BenchmarkTensors:
        assert ctx.num_active_loras <= ctx.num_loras
        total_tokens = ctx.batch_size * ctx.seq_length
        # Make metadata tensors involved in correctness testing.
        # Prepare seq lens tensor
        seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
                                       (ctx.batch_size, ))
        # Prepare seq_start_loc tensor
        seq_start_loc_tensor = torch.cumsum(torch.tensor(
            [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
                                            dim=0)
        assert total_tokens == seq_len_tensor.sum()
        # Prepare prompt lora indices tensor
        prompt_lora_indices_tensor = make_prompt_lora_mapping(
            ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
-        # Prepare token lora indices tensor
+
        # Make LoRAKernelMeta
        token_lora_indices_tensor = make_token_lora_mapping(
            total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
            seq_len_tensor, "cpu")
        lora_kernel_meta = LoRAKernelMeta.make(
            max_loras=ctx.num_loras,
            max_num_tokens=token_lora_indices_tensor.size(0),
            device="cpu")
        lora_kernel_meta.prepare_tensors(
            token_lora_mapping=token_lora_indices_tensor)
        return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
-                                seq_len_tensor, seq_start_loc_tensor,
+                                lora_kernel_meta, seq_len_tensor,
-                                prompt_lora_indices_tensor,
+                                prompt_lora_indices_tensor)
                                token_lora_indices_tensor)
    def sanity_check(self) -> None:
        """
@@ -446,9 +380,9 @@ class BenchmarkTensors:
        # check metadata tensors
        assert torch.sum(self.seq_lens) == num_tokens
        num_seqs = self.seq_lens.shape[0]
-        assert self.seq_start_loc.shape[0] == num_seqs
+        #assert self.seq_start_loc.shape[0] == num_seqs
        assert self.prompt_lora_mapping.shape[0] == num_seqs
-        assert self.token_lora_mapping.shape[0] == num_tokens
+        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
    def to_device(self, device: str):
        """
@@ -463,54 +397,31 @@ class BenchmarkTensors:
        self.input = to_device(self.input)
        self.output = to_device(self.output)
        self.seq_lens = to_device(self.seq_lens)
        self.seq_start_loc = to_device(self.seq_start_loc)
        self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
        self.token_lora_mapping = to_device(self.token_lora_mapping)
        for i in range(len(self.lora_weights_lst)):
            self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
-    def metadata(self) -> Tuple[int, int, int]:
+        # LoRA meta
        for field_name in LoRAKernelMeta.__dataclass_fields__:
            field = getattr(self.lora_kernel_meta, field_name)
            assert isinstance(field, torch.Tensor)
            setattr(self.lora_kernel_meta, field_name, to_device(field))
    def metadata(self) -> tuple[int, int, int]:
        """
        Return num_seqs, num_tokens and max_seq_len
        """
        num_seqs = self.seq_lens.shape[0]
-        num_tokens = self.token_lora_mapping.shape[0]
+        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
        max_seq_len = torch.max(self.seq_lens).item()
        num_slices = len(self.lora_weights_lst)
        return num_seqs, num_tokens, max_seq_len, num_slices
-    def convert_to_sgmv_benchmark_tensors(self):
+    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
        """
        For sgmv punica kernels, when consecutive sequences have the
        same LoRA ID, we just merge them together.
        This happens in punica.py::compute_metadata
        """
        # Collapse seq_lens and seq_start_loc
        _, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
                                               return_counts=True)
        cum_result = torch.cumsum(seq_lens, dim=0)
        seq_start_loc = torch.zeros_like(seq_lens)
        seq_start_loc[1:].copy_(cum_result[:-1])
        # Collapse prompt mapping
        prompt_lora_mapping = torch.unique_consecutive(
            self.prompt_lora_mapping)
        assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
         f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
        self.prompt_lora_mapping = prompt_lora_mapping.to(
            dtype=self.prompt_lora_mapping.dtype)
        self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
        self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
    def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]:
        self.convert_to_sgmv_benchmark_tensors()
        self.sanity_check()
        self.to_device(self.input.device)
-        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata()
        # Sanity check matrix shapes.
        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
@@ -531,22 +442,20 @@ class BenchmarkTensors:
            'inputs': self.input,
            'lora_a_weights': self.lora_weights_lst,
            'output_tensor': self.output,
-            'b_seq_start_loc': self.seq_start_loc,
+            'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
-            'seq_len_tensor': self.seq_lens,
+            'token_indices_sorted_by_lora_ids':
-            'lora_indices_tensor': self.prompt_lora_mapping,
+            self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
-            'batches': num_seqs,
+            'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
-            'max_seq_length': max_seq_len,
+            'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
-            'token_nums': num_tokens,
+            'lora_ids': self.lora_kernel_meta.active_lora_ids,
            'scaling': 1.0,
        }
-    def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
+    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
        self.convert_to_sgmv_benchmark_tensors()
        self.sanity_check()
        self.to_device(self.input.device)
-        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata()
        # Sanity check matrix shapes.
        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
@@ -568,124 +477,28 @@ class BenchmarkTensors:
            'inputs': self.input,
            'lora_b_weights': self.lora_weights_lst,
            'output_tensor': self.output,
-            'b_seq_start_loc': self.seq_start_loc,
+            'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
-            'seq_len_tensor': self.seq_lens,
+            'token_indices_sorted_by_lora_ids':
-            'lora_indices_tensor': self.prompt_lora_mapping,
+            self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
-            'batches': num_seqs,
+            'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
-            'max_seq_length': max_seq_len,
+            'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
-            'token_nums': num_tokens,
+            'lora_ids': self.lora_kernel_meta.active_lora_ids,
            'offset_start': 0,
            'add_inputs': add_inputs,
        }
    def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]:
        assert len(self.lora_weights_lst) == 1
        self.to_device(self.input.device)
        _, num_tokens, _, _ = self.metadata()
        # Sanity check shapes
        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
            0].shape, self.output.shape
        # Expected input shape [num_tokens, hidden_size]
        assert len(i_shape) == 2
        assert i_shape[0] == num_tokens
        hidden_size = i_shape[1]
        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
        assert len(lw_shape) == 3
        assert lw_shape[2] == hidden_size
        lora_rank = lw_shape[1]
        # Expected output shape [num_tokens, lora_rank]
        assert len(o_shape) == 2
        assert o_shape == (num_tokens, lora_rank)
        return {
            'inputs': self.input,
            'lora_a_weights': self.lora_weights_lst[0],
            'output_tensor': self.output,
            'lora_indices_tensor': self.token_lora_mapping,
            'scaling': 1.0
        }
    def as_bgmv_expand_kwargs(self, add_inputs: bool):
        assert len(self.lora_weights_lst) == 1
        self.to_device(self.input.device)
        _, num_tokens, _, _ = self.metadata()
        # Sanity check shapes
        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
            0].shape, self.output.shape
        # Expected input shape [num_tokens, lora_rank]
        assert len(i_shape) == 2
        assert i_shape[0] == num_tokens
        lora_rank = i_shape[1]
        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
        assert len(lw_shape) == 3
        assert lw_shape[2] == lora_rank
        hidden_size = lw_shape[1]
        # Expected output shape [num_tokens, hidden_size]
        assert len(o_shape) == 2
        assert o_shape == (num_tokens, hidden_size)
        return {
            'inputs': self.input,
            'lora_b_weights': self.lora_weights_lst[0],
            'output_tensor': self.output,
            'lora_indices_tensor': self.token_lora_mapping,
            'add_inputs': add_inputs
        }
    def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
        _, num_tokens, _, num_slices = self.metadata()
        # Sanity check shapes
        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
            0].shape, self.output.shape
        # Expected input shape [num_slices, num_tokens, lora_rank]
        assert len(i_shape) == 3
        assert i_shape[0] == num_slices
        assert i_shape[1] == num_tokens
        lora_rank = i_shape[2]
        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
        assert len(lw_shape) == 3
        assert lw_shape[2] == lora_rank
        hidden_size = lw_shape[1]
        # Expected output shape [num_tokens, hidden_size * num_slices]
        assert len(o_shape) == 2
        assert o_shape == (num_tokens, hidden_size * num_slices)
        self.to_device(self.input.device)
        kwargs_list = []
        for i in range(num_slices):
            kwargs_list.append({
                'inputs': self.input[i],
                'lora_b_weights': self.lora_weights_lst[i],
                'output_tensor': self.output,
                'lora_indices_tensor': self.token_lora_mapping,
                'slice_offset': i * hidden_size,
                'slice_size': hidden_size,
                'add_inputs': add_inputs,
            })
        return {'kwargs_list': kwargs_list}
    def bench_fn_kwargs(self,
                        op_type: OpType,
-                        add_inputs: Optional[bool] = None) -> Dict[str, Any]:
+                        add_inputs: Optional[bool] = None) -> dict[str, Any]:
        if op_type.is_shrink_fn():
            assert add_inputs is None
        else:
            assert add_inputs is not None
-        if op_type == OpType.SGMV_SHRINK:
+        if op_type == OpType.LORA_SHRINK:
-            return self.as_sgmv_shrink_kwargs()
+            return self.as_lora_shrink_kwargs()
-        if op_type == OpType.SGMV_EXPAND:
+        if op_type == OpType.LORA_EXPAND:
-            return self.as_sgmv_expand_kwargs(add_inputs)
+            return self.as_lora_expand_kwargs(add_inputs)
        if op_type == OpType.BGMV_SHRINK:
            return self.as_bgmv_shrink_kwargs()
        if op_type == OpType.BGMV_EXPAND:
            return self.as_bgmv_expand_kwargs(add_inputs)
        if op_type == OpType.BGMV_EXPAND_SLICE:
            return self.as_bgmv_expand_slice_kwargs(add_inputs)
        raise ValueError(f"Unrecognized optype {self}")
    def test_correctness(self, op_type: OpType,
@@ -734,7 +547,7 @@ def bench_optype(ctx: BenchmarkContext,
        assert expand_fn_add_inputs is not None
    # BenchmarkContext -> BenchmarkTensors
-    bench_tensors : List[BenchmarkTensors] = \
+    bench_tensors : list[BenchmarkTensors] = \
        [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)]
    for bt in bench_tensors:
        bt.sanity_check()
@@ -746,7 +559,7 @@ def bench_optype(ctx: BenchmarkContext,
            for bt in bench_tensors
        ])
-    # BenchmarkTensors -> Dict (kwargs)
+    # BenchmarkTensors -> dict (kwargs)
    kwargs_list = [
        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
        for bt in bench_tensors
@@ -841,7 +654,7 @@ def use_cuda_graph_recommendation() -> str:
            """
-def print_timers(timers: List[TMeasurement],
+def print_timers(timers: list[TMeasurement],
                 args: Optional[argparse.Namespace] = None):
    compare = TBenchmark.Compare(timers)
    compare.print()
@@ -861,7 +674,7 @@ def print_timers(timers: List[TMeasurement],
          "small num_loras the goal should be to match the torch.mm numbers.")
-def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
+def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
    if args.cuda_graph_nops is not None:
        assert args.cuda_graph_nops > 0
@@ -873,14 +686,7 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
    timers = []
    for bench_ctx in bench_ctxs:
        for seq_len in args.seq_lengths:
-            bench_ops: List[OpType] = []
+            bench_ops: list[OpType] = args.op_types
            if seq_len == 1:
                # bench all decode ops
                bench_ops = [op for op in args.op_types if op.is_decode_op()]
            else:
                # bench all prefill ops
                bench_ops = [op for op in args.op_types if op.is_prefill_op()]
            seq_len_timers = []
            for bench_op in bench_ops:
                for num_slices in bench_op.num_slices():
@@ -921,10 +727,10 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
            pickle.dump(timers, f)
-def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int],
+def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int],
-                          args: argparse.Namespace) -> List[BenchmarkContext]:
+                          args: argparse.Namespace) -> list[BenchmarkContext]:
-    ctxs: List[BenchmarkContext] = []
+    ctxs: list[BenchmarkContext] = []
    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
            args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras,
            args.sort_by_lora_id):
@@ -954,7 +760,7 @@ def run_list_bench(args: argparse.Namespace):
          f"  LoRA Ranks {args.lora_ranks}")
    # Get all benchmarking contexts
-    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+    bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
        hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args)
    run(args, bench_contexts)
@@ -975,7 +781,7 @@ def run_range_bench(args: argparse.Namespace):
          f" LoRA Ranks {lora_ranks}")
    # Get all benchmarking contexts
-    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+    bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
        hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args)
    run(args, bench_contexts)
@@ -1002,7 +808,7 @@ def run_model_bench(args: argparse.Namespace):
          f" LoRA Ranks {args.lora_ranks}")
    # Get all benchmarking contexts
-    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+    bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
        hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args)
    run(args, bench_contexts)
@@ -1090,13 +896,13 @@ Benchmark LoRA kernels:
    {use_cuda_graph_recommendation()}
    list_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
    model_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16  --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 
+        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16  --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 
    range_bench example:
-        python3 benchmarks/kernels/benchmark_lora.py range_bench  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16   --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 
+        python3 benchmarks/kernels/benchmark_lora.py range_bench  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16   --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -7,9 +7,10 @@ import math
 import os
 import pickle as pkl
 import time
 from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Iterable, List, Optional, Tuple
+from typing import Callable, Optional
 import pandas as pd
 import torch
@@ -44,7 +45,6 @@ def terse_type_name(dt):
        torch.float16: "fp16",
        torch.int8: "int8",
        torch.float8_e4m3fn: "fp8",
        torch.bfloat16: "bf16",
        torch.float: "float",
        torch.int: "int",
    }[dt]
@@ -102,8 +102,8 @@ def quantize_and_pack(atype: torch.dtype,
    return w_ref, w_q, w_s, w_zp
-def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
+def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig,
-                         group_size: Optional[int]) -> List[BenchmarkTensors]:
+                         group_size: Optional[int]) -> list[BenchmarkTensors]:
    m, n, k = shape
    # we want to make sure that weights don't fit into L2 cache between runs so
@@ -114,7 +114,7 @@ def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
    a = rand_data((m, k), types.act_type, scale=5)
-    benchmark_tensors: List[BenchmarkTensors] = []
+    benchmark_tensors: list[BenchmarkTensors] = []
    for _ in range(num_weights):
        w = rand_data((k, n), types.act_type, scale=5)
@@ -258,7 +258,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
    return lambda: ops.machete_mm(
        a=bt.a,
-        b_q=bt.w_q,
+        b_q=w_q,
        b_type=bt.wtype,
        b_group_scales=bt.w_g_s,
        b_group_zeros=w_g_zp,
@@ -276,7 +276,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
 def bench_fns(label: str, sub_label: str, description: str,
-              fns: List[Callable]):
+              fns: list[Callable]):
    min_run_time = 1 if not NVTX_PROFILE else 0.1
    res = TBenchmark.Timer(
@@ -311,7 +311,7 @@ def bench(types: TypeConfig,
          n: int,
          label: str,
          sub_label: str,
-          sweep_schedules: bool = True) -> List[TMeasurement]:
+          sweep_schedules: bool = True) -> list[TMeasurement]:
    benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
    sub_label += f", L={len(benchmark_tensors)}"
@@ -414,12 +414,12 @@ def bench(types: TypeConfig,
 # runner
-def print_timers(timers: List[TMeasurement]):
+def print_timers(timers: list[TMeasurement]):
    compare = TBenchmark.Compare(timers)
    compare.print()
-def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
    types = TypeConfig(
        act_type=args.act_type,
        weight_type=scalar_types.uint4b8 if args.group_zero_type is None \
@@ -431,7 +431,7 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
        token_scale_type=args.token_scale_type,
    )
-    results: List[TMeasurement] = []
+    results: list[TMeasurement] = []
    for m, k, n in MKNs:
        timers = bench(types,
                       args.group_size,
@@ -449,8 +449,8 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
 # output makers
 def make_output(
-    data: List[TMeasurement],
+    data: list[TMeasurement],
-    MKNs: Iterable[Tuple[int, int, int]],
+    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
 ):
@@ -497,7 +497,7 @@ def run_model_bench(args):
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")
-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+    def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 from typing import List
 import torch
 import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES
@@ -10,6 +8,8 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
 from vllm.model_executor.layers.quantization.utils.allspark_utils import (
    ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, ALLSPARK_SUPPORTED_QUANT_TYPES)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
    MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
@@ -18,18 +18,18 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
    marlin_24_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    gptq_pack, gptq_quantize_weights, sort_weights)
+    gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
 from vllm.scalar_type import ScalarType
 from vllm.utils import FlexibleArgumentParser
 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
 ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]
-def bench_run(results: List[benchmark.Measurement], model: str,
+def bench_run(results: list[benchmark.Measurement], model: str,
              act_order: bool, is_k_full: bool, quant_type: ScalarType,
              group_size: int, size_m: int, size_k: int, size_n: int):
    label = "Quant Matmul"
@@ -81,6 +81,27 @@ def bench_run(results: List[benchmark.Measurement], model: str,
                                          GPTQ_MARLIN_24_MAX_PARALLEL)
    marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
    # AllSpark W8A16 quant
    as_supported_case = (quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
                         and group_size == -1 and not act_order and is_k_full)
    if as_supported_case:
        properties = torch.cuda.get_device_properties(b.device.index)
        sm_count = properties.multi_processor_count
        sm_version = properties.major * 10 + properties.minor
        supported_arch = (sm_version >= 80 and sm_version < 90)
        as_supported_case = as_supported_case and supported_arch
        if supported_arch:
            has_zp = False
            w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size,
                                                has_zp)
            qw = qw.to(torch.uint8)
            qw_reorder, s_reorder, zp_reorder = \
                ops.allspark_repack_weight(
                qw, s, zp, has_zp)
            CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
    globals = {
        # Gen params
        "quant_type": quant_type,
@@ -109,10 +130,19 @@ def bench_run(results: List[benchmark.Measurement], model: str,
        # GPTQ params
        "q_w_gptq": q_w_gptq,
        "repack_sort_indices": repack_sort_indices,
        # AllSpark W8A16 params
        "qw_reorder": qw_reorder if as_supported_case else None,
        "s_reorder": s_reorder if as_supported_case else None,
        "zp_reorder": zp_reorder if as_supported_case else None,
        "sm_count": sm_count if as_supported_case else None,
        "sm_version": sm_version if as_supported_case else None,
        "CUBLAS_M_THRESHOLD":
        CUBLAS_M_THRESHOLD if as_supported_case else None,
        # Kernels
        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
        "gptq_marlin_repack": ops.gptq_marlin_repack,
        "allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
    }
    min_run_time = 1
@@ -172,13 +202,24 @@ def bench_run(results: List[benchmark.Measurement], model: str,
            description="gptq_marlin_repack",
        ).blocked_autorange(min_run_time=min_run_time))
    if as_supported_case:
        results.append(
            benchmark.Timer(
                stmt=
                "output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)",  # noqa: E501
                globals=globals,
                label=label,
                sub_label=sub_label,
                description="allspark_w8a16_gemm_fp32",
            ).blocked_autorange(min_run_time=min_run_time))
 def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")
-    results: List[benchmark.Measurement] = []
+    results: list[benchmark.Measurement] = []
    for model in args.models:
        for layer in WEIGHT_SHAPES[model]:
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 import argparse
 import json
 import time
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
-from typing import Any, Dict, List, Tuple, TypedDict
+from typing import Any, TypedDict
 import ray
 import torch
@@ -16,8 +18,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
-FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
+FP8_DTYPE = current_platform.fp8_dtype()
 ) else torch.float8_e4m3fn
 class BenchmarkConfig(TypedDict):
@@ -29,18 +30,18 @@ class BenchmarkConfig(TypedDict):
    num_stages: int
-def benchmark_config(
+def benchmark_config(config: BenchmarkConfig,
-    config: BenchmarkConfig,
+                     num_tokens: int,
-    num_tokens: int,
+                     num_experts: int,
-    num_experts: int,
+                     shard_intermediate_size: int,
-    shard_intermediate_size: int,
+                     hidden_size: int,
-    hidden_size: int,
+                     topk: int,
-    topk: int,
+                     dtype: torch.dtype,
-    dtype: torch.dtype,
+                     use_fp8_w8a8: bool,
-    use_fp8_w8a8: bool,
+                     use_int8_w8a16: bool,
-    use_int8_w8a16: bool,
+                     num_iters: int = 100,
-    num_iters: int = 100,
+                     block_quant_shape: List[int] = None,
-) -> float:
+                     use_deep_gemm: bool = False) -> float:
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    if use_int8_w8a16:
@@ -81,8 +82,24 @@ def benchmark_config(
                               dtype=torch.float32)
        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
    if use_fp8_w8a8:
-        w1_scale = torch.randn(num_experts, dtype=torch.float32)
+        if block_quant_shape:
-        w2_scale = torch.randn(num_experts, dtype=torch.float32)
+            block_n, block_k = block_quant_shape[0], block_quant_shape[1]
            E = num_experts
            N = shard_intermediate_size // 2
            K = hidden_size
            factor_for_scale = 1e-2
            n_tiles_w1 = (2 * N + block_n - 1) // block_n
            n_tiles_w2 = (K + block_n - 1) // block_n
            k_tiles_w1 = (K + block_k - 1) // block_k
            k_tiles_w2 = (N + block_k - 1) // block_k
            w1_scale = torch.rand((E, n_tiles_w1, k_tiles_w1),
                                  dtype=torch.float32) * factor_for_scale
            w2_scale = torch.rand((E, n_tiles_w2, k_tiles_w2),
                                  dtype=torch.float32) * factor_for_scale
        else:
            w1_scale = torch.randn(num_experts, dtype=torch.float32)
            w2_scale = torch.randn(num_experts, dtype=torch.float32)
        a1_scale = torch.randn(1, dtype=torch.float32)
        a2_scale = torch.randn(1, dtype=torch.float32)
@@ -97,21 +114,41 @@ def benchmark_config(
    def run():
        from vllm.model_executor.layers.fused_moe import override_config
        with override_config(config):
-            fused_moe(
+            if use_deep_gemm:
-                x,
+                topk_weights, topk_ids = fused_topk(x, input_gating, topk,
-                w1,
+                                                    False)
-                w2,
+                return fused_experts(
-                input_gating,
+                    x,
-                topk,
+                    w1,
-                renormalize=True,
+                    w2,
-                inplace=True,
+                    topk_weights,
-                use_fp8_w8a8=use_fp8_w8a8,
+                    topk_ids,
-                use_int8_w8a16=use_int8_w8a16,
+                    inplace=True,
-                w1_scale=w1_scale,
+                    use_fp8_w8a8=use_fp8_w8a8,
-                w2_scale=w2_scale,
+                    w1_scale=w1_scale,
-                a1_scale=a1_scale,
+                    w2_scale=w2_scale,
-                a2_scale=a2_scale,
+                    a1_scale=a1_scale,
-            )
+                    a2_scale=a2_scale,
                    block_shape=block_quant_shape,
                    allow_deep_gemm=True,
                )
            else:
                fused_moe(
                    x,
                    w1,
                    w2,
                    input_gating,
                    topk,
                    renormalize=True,
                    inplace=True,
                    use_fp8_w8a8=use_fp8_w8a8,
                    use_int8_w8a16=use_int8_w8a16,
                    w1_scale=w1_scale,
                    w2_scale=w2_scale,
                    a1_scale=a1_scale,
                    a2_scale=a2_scale,
                    block_shape=block_quant_shape,
                )
    # JIT compilation & warmup
    run()
@@ -132,7 +169,7 @@ def benchmark_config(
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
-    latencies: List[float] = []
+    latencies: list[float] = []
    for i in range(num_iters):
        prepare(i)
        torch.cuda.synchronize()
@@ -175,8 +212,9 @@ def get_rocm_tuning_space(use_fp16):
    return param_ranges
-def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
+def get_configs_compute_bound(use_fp16,
-    configs: List[BenchmarkConfig] = []
+                              block_quant_shape) -> list[dict[str, int]]:
    configs: list[BenchmarkConfig] = []
    if current_platform.is_rocm():
        param_ranges = get_rocm_tuning_space(use_fp16)
@@ -204,17 +242,27 @@ def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
    for config_values in product(*values):
        config = dict(zip(keys, config_values))
        configs.append(config)
    # Remove configs that are not compatible with fp8 block quantization
    # BLOCK_SIZE_K must be a multiple of block_k
    # BLOCK_SIZE_N must be a multiple of block_n
    if block_quant_shape is not None and not use_fp16:
        block_n, block_k = block_quant_shape[0], block_quant_shape[1]
        for config in configs[:]:
            if config["BLOCK_SIZE_K"] % block_k != 0 or config[
                    "BLOCK_SIZE_N"] % block_n != 0:
                configs.remove(config)
    return configs
 def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
-                            search_space, is_fp16):
+                            search_space, is_fp16, topk):
    N1, K1 = shard_intermediate_size, hidden_size
    N2, K2 = hidden_size, shard_intermediate_size // 2
-    pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
+    pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1,
-                                        is_fp16)
+                                        search_space, is_fp16)
-    pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
+    pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2,
-                                        is_fp16)
+                                        search_space, is_fp16)
    search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
    return search_space
@@ -335,7 +383,9 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
-    ) -> Tuple[Dict[str, int], float]:
+        block_quant_shape: List[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        current_platform.seed_everything(self.seed)
        dtype_str = get_config_dtype_str(dtype,
                                         use_int8_w8a16=use_int8_w8a16,
@@ -355,10 +405,18 @@ class BenchmarkWorker:
        else:
            config = op_config[min(op_config.keys(),
                                   key=lambda x: abs(x - num_tokens))]
-        kernel_time = benchmark_config(config, num_tokens, num_experts,
+        kernel_time = benchmark_config(config,
-                                       shard_intermediate_size, hidden_size,
+                                       num_tokens,
-                                       topk, dtype, use_fp8_w8a8,
+                                       num_experts,
-                                       use_int8_w8a16)
+                                       shard_intermediate_size,
                                       hidden_size,
                                       topk,
                                       dtype,
                                       use_fp8_w8a8,
                                       use_int8_w8a16,
                                       num_iters=100,
                                       block_quant_shape=block_quant_shape,
                                       use_deep_gemm=use_deep_gemm)
        return config, kernel_time
    def tune(
@@ -371,8 +429,10 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
-        search_space: List[Dict[str, int]],
+        search_space: list[dict[str, int]],
-    ) -> Dict[str, int]:
+        block_quant_shape: list[int],
        use_deep_gemm: bool,
    ) -> dict[str, int]:
        best_config = None
        best_time = float("inf")
        if current_platform.is_rocm():
@@ -380,21 +440,25 @@ class BenchmarkWorker:
            search_space = prune_rocm_search_space(num_tokens,
                                                   shard_intermediate_size,
                                                   hidden_size, search_space,
-                                                   is_fp16)
+                                                   is_fp16, topk)
-        with torch.cuda.device(self.device_id):
+        with torch.cuda.device(self.device_id) if current_platform.is_rocm(
        ) else nullcontext():
            for config in tqdm(search_space):
                try:
-                    kernel_time = benchmark_config(config,
+                    kernel_time = benchmark_config(
-                                                   num_tokens,
+                        config,
-                                                   num_experts,
+                        num_tokens,
-                                                   shard_intermediate_size,
+                        num_experts,
-                                                   hidden_size,
+                        shard_intermediate_size,
-                                                   topk,
+                        hidden_size,
-                                                   dtype,
+                        topk,
-                                                   use_fp8_w8a8,
+                        dtype,
-                                                   use_int8_w8a16,
+                        use_fp8_w8a8,
-                                                   num_iters=20)
+                        use_int8_w8a16,
                        num_iters=20,
                        block_quant_shape=block_quant_shape,
                        use_deep_gemm=use_deep_gemm)
                except triton.runtime.autotuner.OutOfResources:
                    # Some configurations may be invalid and fail to compile.
                    continue
@@ -434,10 +498,10 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
    }
-def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
+def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int,
                 shard_intermediate_size: int, hidden_size: int, topk: int,
-                 dtype: torch.dtype, use_fp8_w8a8: bool,
+                 dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool,
-                 use_int8_w8a16: bool) -> None:
+                 block_quant_shape: List[int]) -> None:
    dtype_str = get_config_dtype_str(dtype,
                                     use_int8_w8a16=use_int8_w8a16,
                                     use_fp8_w8a8=use_fp8_w8a8)
@@ -445,7 +509,7 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
    # is the intermediate size after silu_and_mul.
    filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
-                                    dtype_str)
+                                    dtype_str, block_quant_shape)
    print(f"Writing best config to {filename}...")
    with open(filename, "w") as f:
@@ -453,9 +517,17 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
        f.write("\n")
 def get_weight_block_size_safety(config, default_value=None):
    quantization_config = getattr(config, 'quantization_config', {})
    if isinstance(quantization_config, dict):
        return quantization_config.get('weight_block_size', default_value)
    return default_value
 def main(args: argparse.Namespace):
    print(args)
-
+    block_quant_shape = None
    config = AutoConfig.from_pretrained(
        args.model, trust_remote_code=args.trust_remote_code)
    if config.architectures[0] == "DbrxForCausalLM":
@@ -468,12 +540,22 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] == "DeepseekV3ForCausalLM":
+    elif (config.architectures[0] == "DeepseekV3ForCausalLM"
          or config.architectures[0] == "DeepseekV2ForCausalLM"):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
        block_quant_shape = get_weight_block_size_safety(config)
    elif config.architectures[0] == "Qwen2MoeForCausalLM":
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
        if not hasattr(config, "hidden_size"):
            # Support for llama4
            config = config.text_config
        # Default: Mixtral.
        E = config.num_local_experts
        topk = config.num_experts_per_tok
@@ -493,11 +575,13 @@ def main(args: argparse.Namespace):
    else:
        batch_sizes = [args.batch_size]
    use_deep_gemm = bool(args.use_deep_gemm)
    ray.init()
    num_gpus = int(ray.available_resources()["GPU"])
    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
-    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
+    def _distribute(method: str, inputs: list[Any]) -> list[Any]:
        outputs = []
        worker_idx = 0
        for input_args in inputs:
@@ -510,27 +594,30 @@ def main(args: argparse.Namespace):
    if args.tune:
        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
-        search_space = get_configs_compute_bound(is_fp16)
+        search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
        print(f"Start tuning over {len(search_space)} configurations...")
        start = time.time()
        configs = _distribute(
            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                      topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space)
+                      topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
                      block_quant_shape, use_deep_gemm)
                     for batch_size in batch_sizes])
        best_configs = {
            M: sort_config(config)
            for M, config in zip(batch_sizes, configs)
        }
        save_configs(best_configs, E, shard_intermediate_size, hidden_size,
-                     topk, dtype, use_fp8_w8a8, use_int8_w8a16)
+                     topk, dtype, use_fp8_w8a8, use_int8_w8a16,
                     block_quant_shape)
        end = time.time()
        print(f"Tuning took {end - start:.2f} seconds")
    else:
        outputs = _distribute(
-            "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
+            "benchmark",
-                           topk, dtype, use_fp8_w8a8, use_int8_w8a16)
+            [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
-                          for batch_size in batch_sizes])
+              use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
             for batch_size in batch_sizes])
        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
            print(f"Batch size: {batch_size}, config: {config}")
@@ -551,6 +638,7 @@ if __name__ == "__main__":
                        type=str,
                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
                        default="auto")
    parser.add_argument("--use-deep-gemm", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--tune", action="store_true")
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -2,17 +2,21 @@
 import random
 import time
-from typing import List, Optional
+from typing import Optional
 import torch
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
                        create_kv_caches_with_random)
-NUM_BLOCKS = 1024
+logger = init_logger(__name__)
 NUM_BLOCKS = 128 * 1024
 PARTITION_SIZE = 512
 PARTITION_SIZE_ROCM = 256
@torch.inference_mode()
@@ -54,7 +58,7 @@ def main(
    # Create the block tables.
    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables_lst: List[List[int]] = []
+    block_tables_lst: list[list[int]] = []
    for _ in range(num_seqs):
        block_table = [
            random.randint(0, NUM_BLOCKS - 1)
@@ -80,6 +84,12 @@ def main(
    # Prepare for the paged attention kernel.
    output = torch.empty_like(query)
    if version == "v2":
        if current_platform.is_rocm():
            global PARTITION_SIZE
            if not args.custom_paged_attn:
                PARTITION_SIZE = 1024
            else:
                PARTITION_SIZE = PARTITION_SIZE_ROCM
        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
        tmp_output = torch.empty(
            size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -123,32 +133,53 @@ def main(
                    v_scale,
                )
            elif version == "v2":
-                ops.paged_attention_v2(
+                if not args.custom_paged_attn:
-                    output,
+                    ops.paged_attention_v2(
-                    exp_sums,
+                        output,
-                    max_logits,
+                        exp_sums,
-                    tmp_output,
+                        max_logits,
-                    query,
+                        tmp_output,
-                    key_cache,
+                        query,
-                    value_cache,
+                        key_cache,
-                    num_kv_heads,
+                        value_cache,
-                    scale,
+                        num_kv_heads,
-                    block_tables,
+                        scale,
-                    seq_lens,
+                        block_tables,
-                    block_size,
+                        seq_lens,
-                    max_seq_len,
+                        block_size,
-                    alibi_slopes,
+                        max_seq_len,
-                    kv_cache_dtype,
+                        alibi_slopes,
-                    k_scale,
+                        kv_cache_dtype,
-                    v_scale,
+                        k_scale,
-                )
+                        v_scale,
                    )
                else:
                    ops.paged_attention_rocm(
                        output,
                        exp_sums,
                        max_logits,
                        tmp_output,
                        query,
                        key_cache,
                        value_cache,
                        num_kv_heads,
                        scale,
                        block_tables,
                        seq_lens,
                        block_size,
                        max_seq_len,
                        alibi_slopes,
                        kv_cache_dtype,
                        k_scale,
                        v_scale,
                    )
            else:
                raise ValueError(f"Invalid version: {version}")
        torch.cuda.synchronize()
        end_time = time.perf_counter()
        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters
    # Warmup.
@@ -165,6 +196,9 @@ def main(
 if __name__ == '__main__':
    logger.warning("This script benchmarks the paged attention kernel. "
                   "By default this is no longer used in vLLM inference.")
    parser = FlexibleArgumentParser(
        description="Benchmark the paged attention kernel.")
    parser.add_argument("--version",
@@ -195,6 +229,9 @@ if __name__ == '__main__':
        help="Data type for kv cache storage. If 'auto', will use model "
        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
        "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
    parser.add_argument("--custom-paged-attn",
                        action="store_true",
                        help="Use custom paged attention")
    args = parser.parse_args()
    print(args)
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -40,7 +40,7 @@ def main(num_tokens: int,
        end_time = time.perf_counter()
        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters
    # Warmup.
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import itertools
-from typing import Optional, Tuple, Union
+from typing import Optional, Union
 import torch
 import triton
@@ -22,7 +22,7 @@ class HuggingFaceRMSNorm(nn.Module):
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        orig_dtype = x.dtype
        x = x.to(torch.float32)
        if residual is not None:
@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
    print(f"Naive output={output_naive}")
    print(f"FlashInfer output={output_flashinfer}")
-    print(f"VLLM output={output_vllm}")
+    print(f"vLLM output={output_vllm}")
    if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
                      rtol=1e-2) and torch.allclose(
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from itertools import accumulate
-from typing import List, Optional
+from typing import Optional
 import nvtx
 import torch
@@ -39,7 +39,7 @@ def benchmark_rope_kernels_multi_lora(
                            })
    # non-batched RoPE takes only one scaling factor, we create multiple
    # instances to simulate the same behavior
-    non_batched_ropes: List[RotaryEmbedding] = []
+    non_batched_ropes: list[RotaryEmbedding] = []
    for scaling_factor in scaling_factors:
        non_batched_ropes.append(
            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -75,3 +75,19 @@ WEIGHT_SHAPES = {
        [7168, 8192],
    ],
 }
 WEIGHT_SHAPES_MOE = {
    "nm-testing/Mixtral-8x7B-Instruct-v0.1": [
        [8, 2, 4096, 28672],
        [8, 2, 14336, 4096],
    ],
    "nm-testing/deepseekv2-lite": [
        [64, 6, 2048, 1408],
    ],
    "ibm-granite/granite-3.0-1b-a400m": [
        [32, 8, 1024, 1024],
    ],
    "ibm-granite/granite-3.0-3b-a800m": [
        [40, 8, 1024, 1536],
    ],
 }
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -0,0 +1,420 @@
 # SPDX-License-Identifier: Apache-2.0
 # Adapted from sglang quantization/tuning_block_wise_kernel.py
 import argparse
 import json
 import multiprocessing as mp
 import os
 import time
 from datetime import datetime
 from typing import Any
 import torch
 import tqdm
 import triton
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    _w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
 mp.set_start_method("spawn", force=True)
 assert current_platform.is_cuda(
 ), "Only support tune w8a8 block fp8 kernel on CUDA device."
 DTYPE_MAP = {
    "float32": torch.float32,
    "float16": torch.float16,
    "half": torch.half,
    "bfloat16": torch.bfloat16,
 }
 def w8a8_block_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    config: dict[str, Any],
    output_dtype: torch.dtype = torch.float16,
 ) -> torch.Tensor:
    """This function performs matrix multiplication with 
    block-wise quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. 
                    It should be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    """
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]
    assert A.shape[-1] == B.shape[-1]
    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
    M = A.numel() // A.shape[-1]
    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
    N, K = B.shape
    assert triton.cdiv(N, block_n) == Bs.shape[0]
    assert triton.cdiv(K, block_k) == Bs.shape[1]
    C_shape = A.shape[:-1] + (N, )
    C = A.new_empty(C_shape, dtype=output_dtype)
    def grid(META):
        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
                triton.cdiv(N, META["BLOCK_SIZE_N"]), )
    if A.dtype == torch.float8_e4m3fn:
        kernel = _w8a8_block_fp8_matmul
    else:
        raise RuntimeError(
            "Currently, only support tune w8a8 block fp8 kernel.")
    kernel[grid](
        A,
        B,
        C,
        As,
        Bs,
        M,
        N,
        K,
        block_n,
        block_k,
        A.stride(-2),
        A.stride(-1),
        B.stride(1),
        B.stride(0),
        C.stride(-2),
        C.stride(-1),
        As.stride(-2),
        As.stride(-1),
        Bs.stride(1),
        Bs.stride(0),
        **config,
    )
    return C
 def get_configs_compute_bound():
    configs = []
    for num_stages in [2, 3, 4, 5]:
        for block_m in [16, 32, 64, 128, 256]:
            for block_k in [64, 128]:
                for block_n in [32, 64, 128, 256]:
                    for num_warps in [4, 8]:
                        for group_size in [1, 16, 32, 64]:
                            configs.append({
                                "BLOCK_SIZE_M": block_m,
                                "BLOCK_SIZE_N": block_n,
                                "BLOCK_SIZE_K": block_k,
                                "GROUP_SIZE_M": group_size,
                                "num_warps": num_warps,
                                "num_stages": num_stages,
                            })
    return configs
 def get_weight_shapes(tp_size):
    # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3.
    # Modify them, if you tune for another different model.
    # cannot TP
    total = [
        (512 + 64, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (7168, 16384),
        (7168, 18432),
    ]
    # N can TP
    n_tp = [
        (18432 * 2, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (24576, 1536),
        (12288, 7168),
        (4096, 7168),
    ]
    # K can TP
    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
    weight_shapes = []
    for t in total:
        weight_shapes.append(t)
    for n_t in n_tp:
        new_t = (n_t[0] // tp_size, n_t[1])
        weight_shapes.append(new_t)
    for k_t in k_tp:
        new_t = (k_t[0], k_t[1] // tp_size)
        weight_shapes.append(new_t)
    return weight_shapes
 def benchmark_config(A,
                     B,
                     As,
                     Bs,
                     block_size,
                     config,
                     out_dtype=torch.float16,
                     num_iters=10):
    def run():
        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
    torch.cuda.synchronize()
    # JIT complication & warmup
    for _ in range(5):
        run()
    torch.cuda.synchronize()
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    latencies: list[float] = []
    for i in range(num_iters):
        torch.cuda.synchronize()
        start_event.record()
        run()
        end_event.record()
        end_event.synchronize()
        latencies.append(start_event.elapsed_time(end_event))
    avg = sum(latencies) / (num_iters * 10) * 1000  # us
    return avg
 def tune(M, N, K, block_size, out_dtype, search_space, input_type):
    factor_for_scale = 1e-2
    if input_type == "fp8":
        fp8_info = torch.finfo(torch.float8_e4m3fn)
        fp8_max, fp8_min = fp8_info.max, fp8_info.min
        A_fp32 = (
            (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
            fp8_max)
        A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
        B_fp32 = (
            (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
            fp8_max)
        B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
    else:
        raise RuntimeError(
            "Currently, only support tune w8a8 block fp8 kernel.")
    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
    As = torch.rand(M, k_tiles, dtype=torch.float32,
                    device="cuda") * factor_for_scale
    Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") *
          factor_for_scale)
    best_config = None
    best_time = float("inf")
    for config in tqdm(search_space):
        try:
            kernel_time = benchmark_config(
                A,
                B,
                As,
                Bs,
                block_size,
                config,
                out_dtype,
                num_iters=10,
            )
        except triton.runtime.autotuner.OutOfResources:
            # Some configurations may be invalid and fail to compile.
            continue
        if kernel_time < best_time:
            best_time = kernel_time
            best_config = config
    now = datetime.now()
    print(f"{now.ctime()}] Completed tuning for batch_size={M}")
    assert best_config is not None
    return best_config
 def save_configs(
    N,
    K,
    block_n,
    block_k,
    configs,
    save_path,
    input_type="fp8",
 ) -> None:
    os.makedirs(save_path, exist_ok=True)
    device_name = current_platform.get_device_name().replace(" ", "_")
    json_file_name = (
        f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
        f"block_shape=[{block_n},{block_k}].json")
    config_file_path = os.path.join(save_path, json_file_name)
    print(f"Writing best config to {config_file_path}...")
    with open(config_file_path, "w") as f:
        json.dump(configs, f, indent=4)
        f.write("\n")
 def tune_on_gpu(args_dict):
    """Run tuning on a specific GPU."""
    gpu_id = args_dict["gpu_id"]
    batch_sizes = args_dict["batch_sizes"]
    weight_shapes = args_dict["weight_shapes"]
    args = args_dict["args"]
    torch.cuda.set_device(gpu_id)
    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
    block_n = args.block_n
    block_k = args.block_k
    out_dtype = DTYPE_MAP[args.out_dtype]
    save_path = args.save_path
    input_type = args.input_type
    search_space = get_configs_compute_bound()
    search_space = [
        config for config in search_space
        if block_k % config["BLOCK_SIZE_K"] == 0
    ]
    start = time.time()
    for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
        N, K = shape[0], shape[1]
        print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
        benchmark_results = [
            tune(
                batch_size,
                N,
                K,
                [block_n, block_k],
                out_dtype,
                search_space,
                input_type,
            ) for batch_size in tqdm(batch_sizes,
                                     desc=f"GPU {gpu_id} - Batch sizes")
        ]
        best_configs = {
            M: config
            for M, config in zip(batch_sizes, benchmark_results)
        }
        save_configs(N, K, block_n, block_k, best_configs, save_path,
                     input_type)
    end = time.time()
    print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
 def distribute_batch_sizes(batch_sizes, num_gpus):
    """Distribute batch sizes across available GPUs."""
    batches_per_gpu = []
    for i in range(num_gpus):
        start_idx = i * len(batch_sizes) // num_gpus
        end_idx = (i + 1) * len(batch_sizes) // num_gpus
        batches_per_gpu.append(batch_sizes[start_idx:end_idx])
    return batches_per_gpu
 def main(args):
    print(args)
    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        raise RuntimeError("No GPU available for tuning")
    print(f"Found {num_gpus} GPUs for parallel tuning")
    torch.cuda.init()
    if args.batch_size is None:
        batch_sizes = [
            1,
            2,
            4,
            8,
            16,
            24,
            32,
            48,
            64,
            96,
            128,
            256,
            512,
            1024,
            1536,
            2048,
            3072,
            4096,
        ]
    else:
        batch_sizes = [args.batch_size]
        num_gpus = 1  # If only one batch size, use only one GPU
    weight_shapes = get_weight_shapes(args.tp_size)
    batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
    process_args = []
    for gpu_id in range(num_gpus):
        process_args.append({
            "gpu_id": gpu_id,
            "batch_sizes": batches_per_gpu[gpu_id],
            "weight_shapes":
            weight_shapes,  # Each GPU processes all weight shapes
            "args": args,
        })
    ctx = mp.get_context("spawn")
    with ctx.Pool(num_gpus) as pool:
        pool.map(tune_on_gpu, process_args)
    print("Multi-GPU tuning completed")
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="""
 Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
    python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
 Then copy to model_executor/layers/quantization/utils/configs
        """,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--tp-size", "-tp", type=int, default=8)
    parser.add_argument("--input-type",
                        type=str,
                        choices=["fp8"],
                        default="fp8")
    parser.add_argument(
        "--out-dtype",
        type=str,
        choices=["float32", "float16", "bfloat16", "half"],
        default="float16",
    )
    parser.add_argument("--block-n", type=int, default=128)
    parser.add_argument("--block-k", type=int, default=128)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--save-path", type=str, default="./")
    args = parser.parse_args()
    main(args)
--- a/benchmarks/kernels/deepgemm/README.md
+++ b/benchmarks/kernels/deepgemm/README.md
@@ -0,0 +1,129 @@
 # DeepSeek DeepGEMM Kernels Benchmark
 This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
 Currently this just includes dense GEMMs and only works on Hopper GPUs.
 ## Setup
 You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory:
 ```
 git clone --recursive https://github.com/deepseek-ai/DeepGEMM
 cd DeepGEMM
 python setup.py install
 uv pip install -e .
 ```
 ## Usage
 ```
 python benchmark_fp8_block_dense_gemm.py
 INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
 ===== STARTING FP8 GEMM BENCHMARK =====
 PyTorch version: 2.5.1+cu124
 CUDA version: 12.4
 Triton version: 3.1.0
 Using device: NVIDIA H100 80GB HBM3
 WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
 INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
 WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
 WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
 INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
 INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
 ===== PERFORMANCE COMPARISON =====
 DeepGEMM Implementation:
 +------+-------+-------+-----------+--------+--------+
 | m    | n     | k     | Time (μs) | TFLOPS | GB/s   |
 +------+-------+-------+-----------+--------+--------+
 |    8 |  4096 |  7168 | 102.9     | 4.6    | 286.4  |
 |    8 |  7168 | 18432 | 70.8      | 29.8   | 1868.8 |
 |    8 | 18432 |  7168 | 69.3      | 30.5   | 1911.8 |
 |   64 |  4096 |  7168 | 69.1      | 54.4   | 439.0  |
 |   64 |  7168 | 18432 | 69.4      | 243.6  | 1933.6 |
 |   64 | 18432 |  7168 | 70.4      | 240.3  | 1917.2 |
 |   64 | 24576 |  1536 | 70.1      | 68.9   | 584.6  |
 |   64 | 32768 |   512 | 68.4      | 31.4   | 307.1  |
 |   64 |  7168 | 16384 | 69.5      | 216.3  | 1718.5 |
 |  128 |  4096 |  7168 | 141.1     | 53.3   | 222.1  |
 |  128 |  7168 | 18432 | 71.9      | 470.5  | 1896.1 |
 |  128 | 18432 |  7168 | 69.3      | 488.2  | 1988.2 |
 | 1024 |  4096 |  7168 | 89.7      | 670.1  | 502.5  |
 | 1024 | 18432 |  7168 | 279.0     | 969.8  | 635.2  |
 | 2048 |  4096 |  7168 | 175.1     | 687.0  | 347.4  |
 | 4096 |  4096 |  7168 | 335.4     | 717.0  | 275.1  |
 +------+-------+-------+-----------+--------+--------+
 vLLM Triton Implementation:
 +------+-------+-------+-----------+--------+--------+--------------+
 | m    | n     | k     | Time (μs) | TFLOPS | GB/s   | vs DeepGEMM  |
 +------+-------+-------+-----------+--------+--------+--------------+
 |    8 |  4096 |  7168 | 74.0      | 6.3    | 398.2  | 1.39x faster |
 |    8 |  7168 | 18432 | 89.6      | 23.6   | 1478.1 | 0.79x slower |
 |    8 | 18432 |  7168 | 113.2     | 18.7   | 1170.4 | 0.61x slower |
 |   64 |  4096 |  7168 | 79.4      | 47.3   | 382.2  | 0.87x slower |
 |   64 |  7168 | 18432 | 98.5      | 171.7  | 1363.0 | 0.70x slower |
 |   64 | 18432 |  7168 | 119.5     | 141.5  | 1129.4 | 0.59x slower |
 |   64 | 24576 |  1536 | 37.6      | 128.4  | 1089.7 | 1.86x faster |
 |   64 | 32768 |   512 | 38.7      | 55.5   | 542.6  | 1.77x faster |
 |   64 |  7168 | 16384 | 86.1      | 174.5  | 1386.4 | 0.81x slower |
 |  128 |  4096 |  7168 | 90.7      | 82.9   | 345.4  | 1.56x faster |
 |  128 |  7168 | 18432 | 144.0     | 234.9  | 946.9  | 0.50x slower |
 |  128 | 18432 |  7168 | 229.5     | 147.4  | 600.1  | 0.30x slower |
 | 1024 |  4096 |  7168 | 242.3     | 248.2  | 186.1  | 0.37x slower |
 | 1024 | 18432 |  7168 | 897.8     | 301.4  | 197.4  | 0.31x slower |
 | 2048 |  4096 |  7168 | 463.0     | 259.7  | 131.4  | 0.38x slower |
 | 4096 |  4096 |  7168 | 901.8     | 266.7  | 102.3  | 0.37x slower |
 +------+-------+-------+-----------+--------+--------+--------------+
 vLLM CUTLASS Implementation:
 +------+-------+-------+-----------+--------+--------+--------------+--------------+
 | m    | n     | k     | Time (μs) | TFLOPS | GB/s   | vs DeepGEMM  | vs Triton    |
 +------+-------+-------+-----------+--------+--------+--------------+--------------+
 |    8 |  4096 |  7168 | 34.6      | 13.6   | 852.3  | 2.98x faster | 2.14x faster |
 |    8 |  7168 | 18432 | 78.9      | 26.8   | 1677.3 | 0.90x slower | 1.13x faster |
 |    8 | 18432 |  7168 | 81.2      | 26.0   | 1631.1 | 0.85x slower | 1.39x faster |
 |   64 |  4096 |  7168 | 36.9      | 101.9  | 822.9  | 1.87x faster | 2.15x faster |
 |   64 |  7168 | 18432 | 87.4      | 193.4  | 1535.2 | 0.79x slower | 1.13x faster |
 |   64 | 18432 |  7168 | 85.0      | 199.0  | 1587.6 | 0.83x slower | 1.41x faster |
 |   64 | 24576 |  1536 | 28.0      | 172.8  | 1465.8 | 2.51x faster | 1.35x faster |
 |   64 | 32768 |   512 | 28.8      | 74.5   | 728.5  | 2.37x faster | 1.34x faster |
 |   64 |  7168 | 16384 | 77.9      | 193.0  | 1532.8 | 0.89x slower | 1.11x faster |
 |  128 |  4096 |  7168 | 39.1      | 192.4  | 802.0  | 3.61x faster | 2.32x faster |
 |  128 |  7168 | 18432 | 93.7      | 360.8  | 1454.2 | 0.77x slower | 1.54x faster |
 |  128 | 18432 |  7168 | 85.7      | 394.8  | 1608.0 | 0.81x slower | 2.68x faster |
 | 1024 |  4096 |  7168 | 99.7      | 603.1  | 452.2  | 0.90x slower | 2.43x faster |
 | 1024 | 18432 |  7168 | 331.3     | 816.7  | 534.9  | 0.84x slower | 2.71x faster |
 | 2048 |  4096 |  7168 | 198.3     | 606.6  | 306.7  | 0.88x slower | 2.34x faster |
 | 4096 |  4096 |  7168 | 392.2     | 613.2  | 235.3  | 0.86x slower | 2.30x faster |
 +------+-------+-------+-----------+--------+--------+--------------+--------------+
 ===== AVERAGE PERFORMANCE =====
 +----------------+------------+----------+---------------+
 | Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) |
 +----------------+------------+----------+---------------+
 | DeepGEMM       | 310.98     | 1052.10  | 0.11          |
 | vLLM Triton    | 144.30     | 715.60   | 0.23          |
 | vLLM CUTLASS   | 286.78     | 1076.67  | 0.11          |
 +----------------+------------+----------+---------------+
 ===== AVERAGE SPEEDUPS =====
 +-----------------------------+--------------+
 | Comparison                  | Speedup      |
 +-----------------------------+--------------+
 | DeepGEMM vs vLLM Triton     | 1.71x faster |
 | DeepGEMM vs vLLM CUTLASS    | 0.94x slower |
 | vLLM CUTLASS vs vLLM Triton | 1.84x faster |
 +-----------------------------+--------------+
 ===== ACCURACY COMPARISON =====
 +----------------+-----------------------+
 | Implementation | Avg Diff vs Reference |
 +----------------+-----------------------+
 | DeepGEMM       | 0.000684              |
 | vLLM Triton    | 0.000684              |
 | vLLM CUTLASS   | 0.000684              |
 +----------------+-----------------------+
 ```
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -0,0 +1,464 @@
 # SPDX-License-Identifier: Apache-2.0
 # fmt: off
 # ruff: noqa: E501
 import time
 # Import DeepGEMM functions
 import deep_gemm
 import torch
 import triton
 from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor
 # Import vLLM functions
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 # Copied from
 # https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9
 def per_token_cast_to_fp8(
        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Convert tensor to FP8 format with per-token scaling."""
    assert x.dim() == 2 and x.size(1) % 128 == 0
    m, n = x.shape
    x_view = x.view(m, -1, 128)
    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(
        torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
 # Copied from
 # https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17
 def per_block_cast_to_fp8(
        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Convert tensor to FP8 format with per-block scaling."""
    assert x.dim() == 2
    m, n = x.shape
    x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
                           dtype=x.dtype,
                           device=x.device)
    x_padded[:m, :n] = x
    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
        x_amax / 448.0).view(x_view.size(0), x_view.size(2))
 def benchmark_shape(m: int,
                    n: int,
                    k: int,
                    warmup: int = 100,
                    repeat: int = 10000,
                    verbose: bool = False) -> dict:
    """Benchmark all implementations for a specific (m, n, k) shape."""
    if verbose:
        print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
    # Create test tensors
    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
    # Reference result in BF16
    torch.cuda.synchronize()
    C_ref = A @ B.t()
    # Pre-quantize B for all implementations
    # (weights can be pre-quantized offline)
    B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B)
    B_vllm, B_scale_vllm = per_block_cast_to_fp8(B)
    # Block size configuration
    block_size = [128, 128]
    # Pre-quantize A for all implementations
    A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
    A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
    A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
    A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
        A, block_size[1], column_major_scales=True)
    # === DeepGEMM Implementation ===
    def deepgemm_gemm():
        # A quantization is inside the loop as it depends on activations
        # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
        # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
        #     A, block_size[1])
        # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
        # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
        deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
                                       (B_deepgemm, B_scale_deepgemm),
                                       C_deepgemm)
        return C_deepgemm
    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
        # A quantization is inside the loop as it depends on activations
        # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
        return w8a8_block_fp8_matmul(A_vllm,
                                     B_vllm,
                                     A_scale_vllm,
                                     B_scale_vllm,
                                     block_size,
                                     output_dtype=torch.bfloat16)
    # === vLLM CUTLASS Implementation ===
    def vllm_cutlass_gemm():
        # A quantization is inside the loop as it depends on activations
        # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
        #     A, block_size[1], column_major_scales=True)
        return ops.cutlass_scaled_mm(A_vllm_cutlass,
                                     B_vllm.T,
                                     scale_a=A_scale_vllm_cutlass,
                                     scale_b=B_scale_vllm.T,
                                     out_dtype=torch.bfloat16)
    # Run correctness check first
    if verbose:
        print("Running correctness check...")
    C_deepgemm = deepgemm_gemm()
    C_vllm_triton = vllm_triton_gemm()
    C_vllm_cutlass = vllm_cutlass_gemm()
    deepgemm_diff = calc_diff(C_deepgemm, C_ref)
    vllm_triton_diff = calc_diff(C_vllm_triton, C_ref)
    vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref)
    if verbose:
        print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
        print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
        print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
        print("vLLM Triton vs DeepGEMM difference: "
              f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
        print("vLLM CUTLASS vs DeepGEMM difference: "
              f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
    # Benchmark implementations
    implementations = {
        "DeepGEMM": deepgemm_gemm,
        "vLLM Triton": vllm_triton_gemm,
        "vLLM CUTLASS": vllm_cutlass_gemm
    }
    benchmark_results = {
        "shape": {
            "m": m,
            "n": n,
            "k": k
        },
        "implementations": {}
    }
    for name, func in implementations.items():
        # Warmup
        for _ in range(warmup):
            func()
            torch.cuda.synchronize()
        # Timing loop
        torch.cuda.synchronize()
        start = time.time()
        for _ in range(repeat):
            func()
        torch.cuda.synchronize()
        end = time.time()
        # Calculate timing and TFLOPS
        avg_time_ms = (end - start) / repeat * 1000
        avg_time_us = avg_time_ms * 1000
        tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12
        gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3)
        benchmark_results["implementations"][name] = {
            "time_ms": avg_time_ms,
            "time_us": avg_time_us,
            "tflops": tflops,
            "gb_s": gb_s,
            "diff": {
                "DeepGEMM":
                0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
                "Reference":
                deepgemm_diff if name == "DeepGEMM" else
                (vllm_triton_diff
                 if name == "vLLM Triton" else vllm_cutlass_diff)
            }
        }
        if verbose:
            print(
                f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
            )
    # Calculate speedups
    baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
    for name, data in benchmark_results["implementations"].items():
        if name != "DeepGEMM":
            speedup = baseline / data["time_ms"]
            benchmark_results["implementations"][name][
                "speedup_vs_deepgemm"] = speedup
            if verbose:
                print(f"DeepGEMM is {1/speedup:.2f}x "
                      f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
        "time_ms"]
    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
        "time_ms"]
    cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
    benchmark_results["implementations"]["vLLM CUTLASS"][
        "speedup_vs_triton"] = cutlass_vs_triton
    if verbose:
        print(
            f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
            f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton"
        )
    return benchmark_results
 def format_table_row(values, widths):
    """Format a row with specified column widths."""
    return "| " + " | ".join(f"{val:{w}}"
                             for val, w in zip(values, widths)) + " |"
 def print_table(headers, rows, title=None):
    """Print a table with headers and rows."""
    if title:
        print(f"\n{title}")
    # Calculate column widths based on headers and data
    widths = [
        max(len(str(h)), max(len(str(row[i])) for row in rows))
        for i, h in enumerate(headers)
    ]
    # Create separator line
    separator = "+-" + "-+-".join("-" * w for w in widths) + "-+"
    # Print table
    print(separator)
    print(format_table_row(headers, widths))
    print(separator)
    for row in rows:
        print(format_table_row(row, widths))
    print(separator)
 def format_speedup(value):
    """Format speedup value with indicator if it's faster or slower."""
    return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}"
 def run_benchmarks(verbose: bool = False):
    """Run benchmarks for a set of common shapes."""
    print("===== STARTING FP8 GEMM BENCHMARK =====")
    # Make sure we're using the GPU
    if not torch.cuda.is_available():
        print("CUDA not available! Tests require GPU.")
        return
    # Print system information
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Triton version: {triton.__version__}")
    print(f"Using device: {torch.cuda.get_device_name()}")
    # Enable TF32 for better performance
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # Set seeds for reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    # Define benchmark shapes (m, n, k)
    shapes = [
        (8, 4096, 7168),
        (8, 7168, 18432),
        (8, 18432, 7168),
        (64, 4096, 7168),
        (64, 7168, 18432),
        (64, 18432, 7168),
        (64, 24576, 1536),
        (64, 32768, 512),
        (64, 7168, 16384),
        (128, 4096, 7168),
        (128, 7168, 18432),
        (128, 18432, 7168),
        (1024, 4096, 7168),
        (1024, 18432, 7168),
        (2048, 4096, 7168),
        (4096, 4096, 7168),
    ]
    shapes = [
        # (64, 2112, 7168),
        (64, 24576, 1536),
        (64, 32768, 512),
        (64, 7168, 16384),
        (64, 4096, 7168),
        (64, 7168, 2048),
        # (128, 2112, 7168),
        (128, 24576, 1536),
        (128, 32768, 512),
        (128, 7168, 16384),
        (128, 4096, 7168),
        (128, 7168, 2048),
        # (4096, 2112, 7168),
        (4096, 24576, 1536),
        (4096, 32768, 512),
        (4096, 7168, 16384),
        (4096, 4096, 7168),
        (4096, 7168, 2048),
    ]
    all_results = []
    for m, n, k in shapes:
        result = benchmark_shape(m, n, k, verbose=verbose)
        all_results.append(result)
    # Print results in a nicely formatted table
    print("\n===== PERFORMANCE COMPARISON =====")
    # Print DeepGEMM table
    deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"]
    deepgemm_rows = []
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["DeepGEMM"]
        deepgemm_rows.append([
            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
        ])
    print_table(deepgemm_headers,
                deepgemm_rows,
                title="DeepGEMM Implementation:")
    # Print vLLM Triton table
    triton_headers = [
        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
    ]
    triton_rows = []
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["vLLM Triton"]
        speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
        triton_rows.append([
            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
            format_speedup(speedup)
        ])
    print_table(triton_headers,
                triton_rows,
                title="vLLM Triton Implementation:")
    # Print vLLM CUTLASS table
    cutlass_headers = [
        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
        "vs Triton"
    ]
    cutlass_rows = []
    for result in all_results:
        shape = result["shape"]
        impl_data = result["implementations"]["vLLM CUTLASS"]
        vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
        vs_triton = impl_data.get("speedup_vs_triton", 1.0)
        cutlass_rows.append([
            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
            format_speedup(vs_deepgemm),
            format_speedup(vs_triton)
        ])
    print_table(cutlass_headers,
                cutlass_rows,
                title="vLLM CUTLASS Implementation:")
    # Calculate and print averages
    print("\n===== AVERAGE PERFORMANCE =====")
    implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
    avg_metrics = {
        impl: {
            "tflops": 0,
            "gb_s": 0,
            "time_ms": 0
        }
        for impl in implementations
    }
    for result in all_results:
        for impl in implementations:
            impl_data = result["implementations"][impl]
            avg_metrics[impl]["tflops"] += impl_data["tflops"]
            avg_metrics[impl]["gb_s"] += impl_data["gb_s"]
            avg_metrics[impl]["time_ms"] += impl_data["time_ms"]
    num_shapes = len(all_results)
    avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"]
    avg_rows = []
    for impl in implementations:
        avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
        avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
        avg_time = avg_metrics[impl]["time_ms"] / num_shapes
        avg_rows.append([
            impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
        ])
    print_table(avg_headers, avg_rows)
    # Calculate average speedups
    avg_speedups = {
        "DeepGEMM vs vLLM Triton": 0,
        "DeepGEMM vs vLLM CUTLASS": 0,
        "vLLM CUTLASS vs vLLM Triton": 0
    }
    for result in all_results:
        deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
        vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
            "time_ms"]
        avg_speedups[
            "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
        avg_speedups[
            "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
        avg_speedups[
            "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
    print("\n===== AVERAGE SPEEDUPS =====")
    speedup_headers = ["Comparison", "Speedup"]
    speedup_rows = []
    for comparison, total in avg_speedups.items():
        avg_speedup = total / num_shapes
        status = "faster" if avg_speedup > 1 else "slower"
        speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"])
    print_table(speedup_headers, speedup_rows)
    # Average accuracy comparison
    print("\n===== ACCURACY COMPARISON =====")
    avg_diff = {impl: 0 for impl in implementations}
    for result in all_results:
        for impl in implementations:
            avg_diff[impl] += result["implementations"][impl]["diff"][
                "Reference"]
    diff_headers = ["Implementation", "Avg Diff vs Reference"]
    diff_rows = []
    for impl in implementations:
        diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"])
    print_table(diff_headers, diff_rows)
 if __name__ == "__main__":
    run_benchmarks(verbose=False)
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -4,7 +4,6 @@ import math
 import pickle
 import re
 from collections import defaultdict
 from typing import List
 import matplotlib.pyplot as plt
 import pandas as pd
@@ -23,7 +22,7 @@ if __name__ == "__main__":
    with open(args.filename, 'rb') as f:
        data = pickle.load(f)
-        raw_results: List[TMeasurement] = data["results"]
+        raw_results: list[TMeasurement] = data["results"]
    results = defaultdict(lambda: list())
    for v in raw_results:
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 import dataclasses
-from typing import Any, Callable, Iterable, Optional
+from collections.abc import Iterable
 from typing import Any, Callable, Optional
 import torch
 import torch.utils.benchmark as TBenchmark
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -1,16 +0,0 @@
 #!/bin/bash
 PORT=8000
 MODEL=$1
 TOKENS=$2
 docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
           -v "$PWD/data:/data" \
           ghcr.io/huggingface/text-generation-inference:2.2.0 \
           --model-id "$MODEL" \
           --sharded false  \
           --max-input-length 1024 \
           --max-total-tokens 2048 \
           --max-best-of 5 \
           --max-concurrent-requests 5000 \
           --max-batch-total-tokens "$TOKENS"
--- a/benchmarks/run_structured_output_benchmark.sh
+++ b/benchmarks/run_structured_output_benchmark.sh
@@ -0,0 +1,65 @@
 #!/bin/bash
 # Define the model to use
 MODEL=${1:-"Qwen/Qwen2.5-7B-Instruct"}
 # Define the backend to use
 BACKEND=${2:-"vllm"}
 # Define the dataset to use
 DATASET=${3:-"xgrammar_bench"}
 # Define the guided decoding backend
 GUIDED_BACKEND=${4:-"xgrammar"}
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"}
 GUIDED_RATIO=${6:-0.5}
 # Create output directory if it doesn't exist
 mkdir -p "$OUTPUT_DIR"
 # Define QPS values to test
 QPS_VALUES=(70 60 50 25 20 15 10)
 # Common parameters
 COMMON_PARAMS="--backend $BACKEND \
               --model $MODEL \
               --dataset $DATASET \
               --structured-output-backend $GUIDED_BACKEND \
               --structured-output-ratio $GUIDED_RATIO \
               --save-results \
               --result-dir $OUTPUT_DIR"
 echo "Starting structured output benchmark with model: $MODEL"
 echo "Backend: $BACKEND"
 echo "Dataset: $DATASET"
 echo "Structured output backend: $GUIDED_BACKEND"
 echo "Results will be saved to: $OUTPUT_DIR"
 echo "----------------------------------------"
 # Run benchmarks with different QPS values
 for qps in "${QPS_VALUES[@]}"; do
  echo "Running benchmark with QPS: $qps"
  # Get git hash and branch for the filename
  GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
  GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
  # Construct filename for this run
  FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
  # Run the benchmark
  python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
    --request-rate $qps \
    --result-filename "$FILENAME" \
    --tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
    --port ${PORT:-8000}
  echo "Completed benchmark with QPS: $qps"
  echo "----------------------------------------"
 done
 echo "All benchmarks completed!"
 echo "Results saved to: $OUTPUT_DIR"
--- a/benchmarks/structured_schemas/structured_schema_1.json
+++ b/benchmarks/structured_schemas/structured_schema_1.json
@@ -1,113 +1,19 @@
 {
-    "$schema":
+    "type": "object",
    "https://json-schema.org/draft/2020-12/schema",
    "title":
    "User Profile",
    "type":
    "object",
    "properties": {
-        "userId": {
+      "name": { "type": "string" },
-            "type": "string",
+      "email": { "type": "string" },
-            "description": "Unique identifier for the user."
+      "street": { "type": "string" },
-        },
+      "city": { "type": "string" },
-        "personalInfo": {
+      "state": { "type": "string" },
-            "type": "object",
+      "zip": { "type": "string" },
-            "properties": {
+      "phone": { "type": "string" },
-                "firstName": {
+      "website": { "type": "string" },
-                    "type": "string",
+      "company": { "type": "string" },
-                    "description": "The user's first name."
+      "age": { "type": "integer" }
                },
                "lastName": {
                    "type": "string",
                    "description": "The user's last name."
                },
                "age": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "The user's age."
                },
                "phoneNumbers": {
                    "type":
                    "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "type": {
                                "type": "string",
                                "enum": ["home", "work", "mobile"],
                                "description": "Type of phone number."
                            },
                            "number": {
                                "type": "string",
                                "pattern": "^\\+?[1-9]\\d{1,14}$",
                                "description": "Phone number in E.164 format."
                            }
                        },
                        "required": ["type", "number"]
                    },
                    "description":
                    "List of phone numbers associated with the user."
                }
            },
            "required": ["firstName", "lastName"]
        },
        "address": {
            "type": "object",
            "properties": {
                "street": {
                    "type": "string",
                    "description": "Street address."
                },
                "city": {
                    "type": "string",
                    "description": "City name."
                },
                "state": {
                    "type": "string",
                    "description": "State or province."
                },
                "postalCode": {
                    "type": "string",
                    "pattern": "^\\d{5}(-\\d{4})?$",
                    "description": "Postal code."
                },
                "country": {
                    "type": "string",
                    "description": "Country name."
                }
            },
            "required": ["street", "city", "state", "postalCode", "country"]
        },
        "preferences": {
            "type": "object",
            "properties": {
                "newsletterSubscribed": {
                    "type":
                    "boolean",
                    "description":
                    "Indicates if the user is subscribed to the newsletter."
                },
                "favoriteCategories": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    },
                    "description": "List of user's favorite categories."
                }
            },
            "required": ["newsletterSubscribed"]
        },
        "accountStatus": {
            "type": "string",
            "enum": ["active", "inactive", "suspended"],
            "description": "Current status of the user's account."
        },
        "registrationDate": {
            "type": "string",
            "format": "date-time",
            "description": "ISO 8601 formatted date-time of user registration."
        }
    },
-    "required":
+    "required": [
-    ["userId", "personalInfo", "address", "accountStatus", "registrationDate"]
+      "name",
      "email"
    ]
 }
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -33,8 +33,6 @@ endif()
 if(MACOSX_FOUND)
    list(APPEND CXX_COMPILE_FLAGS
        "-Xpreprocessor"
        "-fopenmp"
        "-DVLLM_CPU_EXTENSION")
 else()
    list(APPEND CXX_COMPILE_FLAGS
@@ -81,6 +79,7 @@ else()
    find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
    find_isa(${CPUINFO} "S390" S390_FOUND)
 endif()
@@ -129,8 +128,16 @@ elseif (ASIMD_FOUND)
 elseif(APPLE_SILICON_FOUND)
    message(STATUS "Apple Silicon Detected")
    set(ENABLE_NUMA OFF)
 elseif (S390_FOUND)
    message(STATUS "S390 detected")
    # Check for S390 VXE support
    list(APPEND CXX_COMPILE_FLAGS
        "-mvx"
        "-mzvector"
        "-march=native"
        "-mtune=native")
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
 endif()
 #
@@ -140,7 +147,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
    FetchContent_Declare(
        oneDNN
        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG  v3.6
+        GIT_TAG  v3.7.1
        GIT_PROGRESS TRUE
        GIT_SHALLOW TRUE
    )
@@ -181,12 +188,14 @@ set(VLLM_EXT_SRC
    "csrc/cpu/cache.cpp"
    "csrc/cpu/utils.cpp"
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/mla_decode.cpp"
    "csrc/cpu/pos_encoding.cpp"
    "csrc/cpu/torch_bindings.cpp")
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
        "csrc/cpu/quant.cpp"
        "csrc/cpu/shm.cpp"
        ${VLLM_EXT_SRC})
 endif()
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -0,0 +1,66 @@
 include(FetchContent)
 # If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory 
 # instead of downloading.
 # It can be set as an environment variable or passed as a cmake argument.
 # The environment variable takes precedence.
 if (DEFINED ENV{FLASH_MLA_SRC_DIR})
  set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR})
 endif()
 if(FLASH_MLA_SRC_DIR)
  FetchContent_Declare(
        flashmla 
        SOURCE_DIR ${FLASH_MLA_SRC_DIR}
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
  )
 else()
  FetchContent_Declare(
        flashmla
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
        GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
  )
 endif()
 FetchContent_MakeAvailable(flashmla)
 message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
 # Only build FlashMLA kernels if we are building for something compatible with 
 # sm90a
 cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
    set(FlashMLA_SOURCES
        ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
        ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu
        ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu
        ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu)
    set(FlashMLA_INCLUDES
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
        ${flashmla_SOURCE_DIR}/csrc/include)
    set_gencode_flags_for_srcs(
        SRCS "${FlashMLA_SOURCES}"
        CUDA_ARCHS "${FLASH_MLA_ARCHS}")
    define_gpu_extension_target(
        _flashmla_C
        DESTINATION vllm
        LANGUAGE ${VLLM_GPU_LANG}
        SOURCES ${FlashMLA_SOURCES}
        COMPILE_FLAGS ${VLLM_GPU_FLAGS}
        ARCHITECTURES ${VLLM_GPU_ARCHES}
        INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
        USE_SABI 3
        WITH_SOABI)
 else()
    # Create an empty target for setup.py when not targeting sm90a systems
    add_custom_target(_flashmla_C)
 endif()
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -0,0 +1,67 @@
 # vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
 # arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
 # arches in the CUDA case (and instead set the gencodes on a per file basis)
 # we need to manually set VLLM_GPU_ARCHES here.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  foreach(_ARCH ${CUDA_ARCHS})
    string(REPLACE "." "" _ARCH "${_ARCH}")
    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
  endforeach()
 endif()
 #
 # Build vLLM flash attention from source
 #
 # IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
 # Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
 # They should be identical but if they aren't, this is a massive footgun.
 #
 # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
 # To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
 # If no component is specified, vllm-flash-attn is still installed.
 # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
 # This is to enable local development of vllm-flash-attn within vLLM.
 # It can be set as an environment variable or passed as a cmake argument.
 # The environment variable takes precedence.
 if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
 endif()
 if(VLLM_FLASH_ATTN_SRC_DIR)
  FetchContent_Declare(
          vllm-flash-attn SOURCE_DIR 
          ${VLLM_FLASH_ATTN_SRC_DIR}
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
 else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
          GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
 endif()
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
 # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
 # case only one is built, in the case both are built redundant work is done)
 install(
  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
  DESTINATION vllm_flash_attn
  COMPONENT _vllm_fa2_C
  FILES_MATCHING PATTERN "*.py"
 )
 install(
  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
  DESTINATION vllm_flash_attn
  COMPONENT _vllm_fa3_C
  FILES_MATCHING PATTERN "*.py"
 )
--- a/collect_env.py
+++ b/collect_env.py
@@ -105,8 +105,14 @@ def run(command):
    else:
        enc = locale.getpreferredencoding()
    output = raw_output.decode(enc)
    if command == 'nvidia-smi topo -m':
        # don't remove the leading whitespace of `nvidia-smi topo -m`
        #   because they are meaningful
        output = output.rstrip()
    else:
        output = output.strip()
    err = raw_err.decode(enc)
-    return rc, output.strip(), err.strip()
+    return rc, output, err.strip()
 def run_and_read_all(run_lambda, command):
@@ -482,16 +488,28 @@ def get_pip_packages(run_lambda, patterns=None):
    if patterns is None:
        patterns = DEFAULT_PIP_PATTERNS
-    # People generally have `pip` as `pip` or `pip3`
+    def run_with_pip():
-    # But here it is invoked as `python -mpip`
+        try:
-    def run_with_pip(pip):
+            import importlib.util
-        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
+            pip_spec = importlib.util.find_spec('pip')
            pip_available = pip_spec is not None
        except ImportError:
            pip_available = False
        if pip_available:
            cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
        elif os.environ.get("UV") is not None:
            print("uv is set")
            cmd = ["uv", "pip", "list", "--format=freeze"]
        else:
            raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
        out = run_and_read_all(run_lambda, cmd)
        return "\n".join(line for line in out.splitlines()
                         if any(name in line for name in patterns))
    pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
-    out = run_with_pip([sys.executable, '-mpip'])
+    out = run_with_pip()
    return pip_version, out
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -0,0 +1,173 @@
 #include <optional>
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <algorithm>
 #include "attention_dtypes.h"
 #include "attention_utils.cuh"
 namespace vllm {
 // Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
 // can be used to combine partial attention results (in the split-KV case)
 template <typename scalar_t, const uint NUM_THREADS>
 __global__ void merge_attn_states_kernel(
    scalar_t* output, float* output_lse, const scalar_t* prefix_output,
    const float* prefix_lse, const scalar_t* suffix_output,
    const float* suffix_lse, const uint num_tokens, const uint num_heads,
    const uint head_size) {
  using pack_128b_t = uint4;
  const uint pack_size = 16 / sizeof(scalar_t);
  const uint threads_per_head = head_size / pack_size;
  const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x;
  const uint token_head_threads = num_tokens * num_heads * threads_per_head;
  if (global_idx >= token_head_threads) return;
  // global_idx -> token_idx + head_idx + pack_idx
  const uint token_head_idx = global_idx / threads_per_head;
  const uint pack_idx = global_idx % threads_per_head;
  const uint token_idx = token_head_idx / num_heads;
  const uint head_idx = token_head_idx % num_heads;
  const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
  const uint head_offset =
      token_idx * num_heads * head_size + head_idx * head_size;
  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
  scalar_t* output_head_ptr = output + head_offset;
  float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
  float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
  p_lse = std::isinf(p_lse) ? -std::numeric_limits<float>::infinity() : p_lse;
  s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
  const float max_lse = fmaxf(p_lse, s_lse);
  p_lse = p_lse - max_lse;
  s_lse = s_lse - max_lse;
  const float p_se = expf(p_lse);
  const float s_se = expf(s_lse);
  const float out_se = p_se + s_se;
  const float p_scale = p_se / out_se;
  const float s_scale = s_se / out_se;
  if (pack_offset < head_size) {
    // Pack 128b load
    pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
        prefix_head_ptr)[pack_offset / pack_size];
    pack_128b_t s_out_pack = reinterpret_cast<const pack_128b_t*>(
        suffix_head_ptr)[pack_offset / pack_size];
    pack_128b_t o_out_pack;
 #pragma unroll
    for (uint i = 0; i < pack_size; ++i) {
      // Always use float for FMA to keep high precision.
      // half(uint16_t), bfloat16, float -> float.
      const float p_out_f =
          vllm::to_float(reinterpret_cast<const scalar_t*>(&p_out_pack)[i]);
      const float s_out_f =
          vllm::to_float(reinterpret_cast<const scalar_t*>(&s_out_pack)[i]);
      // fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
      const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
      // float -> half(uint16_t), bfloat16, float.
      vllm::from_float(reinterpret_cast<scalar_t*>(&o_out_pack)[i], o_out_f);
    }
    // Pack 128b storage
    reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
        o_out_pack;
  }
  // We only need to write to output_lse once per head.
  if (output_lse != nullptr && pack_idx == 0) {
    float out_lse = logf(out_se) + max_lse;
    output_lse[head_idx * num_tokens + token_idx] = out_lse;
  }
 }
 }  // namespace vllm
 // The following macro is used to dispatch the conversion function based on
 // the output data type. The FN is a macro that calls a function with
 // template<typename scalar_t>.
 #define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn)                      \
  {                                                                     \
    if (scalar_dtype == at::ScalarType::Float) {                        \
      fn(float);                                                        \
    } else if (scalar_dtype == at::ScalarType::Half) {                  \
      fn(uint16_t);                                                     \
    } else if (scalar_dtype == at::ScalarType::BFloat16) {              \
      fn(__nv_bfloat16);                                                \
    } else {                                                            \
      TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \
    }                                                                   \
  }
 #define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS)                     \
  {                                                                         \
    vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS><<<grid, block>>>( \
        reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr,     \
        reinterpret_cast<scalar_t*>(prefix_output.data_ptr()),              \
        reinterpret_cast<float*>(prefix_lse.data_ptr()),                    \
        reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),              \
        reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,        \
        num_heads, head_size);                                              \
  }
 /*@brief Merges the attention states from prefix and suffix
 * into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
 *
 * @param output [n,h,d] The output tensor to store the merged attention states.
 * @param output_lse [h,d] Optional tensor to store the log-sum-exp values.
 * @param prefix_output [n,h,d] The prefix attention states.
 * @param prefix_lse [h,d] The log-sum-exp values for the prefix attention
 * states.
 * @param suffix_output [n,h,d] The suffix attention states.
 * @param suffix_lse [h,d] The log-sum-exp values for the suffix attention
 * states.
 */
 template <typename scalar_t>
 void merge_attn_states_launcher(torch::Tensor& output,
                                std::optional<torch::Tensor> output_lse,
                                const torch::Tensor& prefix_output,
                                const torch::Tensor& prefix_lse,
                                const torch::Tensor& suffix_output,
                                const torch::Tensor& suffix_lse) {
  constexpr uint NUM_THREADS = 128;
  const uint num_tokens = output.size(0);
  const uint num_heads = output.size(1);
  const uint head_size = output.size(2);
  const uint pack_size = 16 / sizeof(scalar_t);
  TORCH_CHECK(head_size % pack_size == 0,
              "headsize must be multiple of pack_size:", pack_size);
  float* output_lse_ptr = nullptr;
  if (output_lse.has_value()) {
    output_lse_ptr = output_lse.value().data_ptr<float>();
  }
  // process one pack elements per thread. float -> 4, half/bf16 -> 8
  const uint threads_per_head = head_size / pack_size;
  const uint total_threads = num_tokens * num_heads * threads_per_head;
  dim3 block(NUM_THREADS);
  dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
  LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
 }
 #define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t)                           \
  {                                                                         \
    merge_attn_states_launcher<scalar_t>(output, output_lse, prefix_output, \
                                         prefix_lse, suffix_output,         \
                                         suffix_lse);                       \
  }
 void merge_attn_states(torch::Tensor& output,
                       std::optional<torch::Tensor> output_lse,
                       const torch::Tensor& prefix_output,
                       const torch::Tensor& prefix_lse,
                       const torch::Tensor& suffix_output,
                       const torch::Tensor& suffix_lse) {
  DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
 }
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -39,3 +39,10 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);
 void gather_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -2,6 +2,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include "cuda_utils.h"
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
@@ -349,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(
 }  // namespace vllm
-// KV_T is the stored data type of kv-cache.
+// KV_T is the data type of key and value tensors.
-// CACHE_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)               \
  vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>             \
@@ -374,7 +375,7 @@ void reshape_and_cache(
    torch::Tensor& slot_mapping,  // [num_tokens]
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale) {
-  int num_tokens = key.size(0);
+  int num_tokens = slot_mapping.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
  int block_size = key_cache.size(3);
@@ -392,8 +393,8 @@ void reshape_and_cache(
                             CALL_RESHAPE_AND_CACHE)
 }
-// KV_T is the stored data type of kv-cache.
+// KV_T is the data type of key and value tensors.
-// CACHE_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE)         \
  vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE>       \
@@ -445,8 +446,8 @@ void reshape_and_cache_flash(
                             CALL_RESHAPE_AND_CACHE_FLASH);
 }
-// KV_T is the stored data type of kv-cache.
+// KV_T is the data type of key and value tensors.
-// CACHE_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)              \
  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>            \
@@ -570,3 +571,161 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
    TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype);
  }
 }
 namespace vllm {
 // grid is launched with dimensions (batch, num_splits)
 template <typename scalar_t>
 __global__ void gather_cache(
    const scalar_t* __restrict__ src_cache,   // [NUM_BLOCKS, BLOCK_SIZE,
                                              // ENTRIES...]
    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRIES...]
    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
    const int32_t* __restrict__ cu_seq_lens,  // [BATCH+1]
    const int32_t block_size, const int32_t entry_size,
    const int64_t block_table_stride, const int64_t cache_block_stride,
    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
    const int32_t* __restrict__ seq_starts) {  // Optional: starting offsets per
                                               // batch
  const int64_t bid = blockIdx.x;  // Batch ID
  const int32_t num_splits = gridDim.y;
  const int32_t split = blockIdx.y;
  const int32_t seq_start = cu_seq_lens[bid];
  const int32_t seq_end = cu_seq_lens[bid + 1];
  const int32_t seq_len = seq_end - seq_start;
  const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
  const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
  const int32_t split_start = split * split_blocks;
  const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
  const bool is_active_split = (split_start < tot_blocks);
  const bool is_last_split = (split_end == tot_blocks);
  if (!is_active_split) return;
  int32_t full_blocks_end = split_end;
  int32_t partial_block_size = 0;
  // Adjust the pointer for the block_table for this batch.
  // If seq_starts is provided, compute an offset based on (seq_starts[bid] /
  // page_size)
  const int32_t batch_offset = bid * block_table_stride;
  int32_t offset = 0;
  if (seq_starts != nullptr) {
    offset = seq_starts[bid] / block_size;
  }
  const int32_t* batch_block_table = block_table + batch_offset + offset;
  // Adjust dst pointer based on the cumulative sequence lengths.
  dst += seq_start * dst_entry_stride;
  if (is_last_split) {
    partial_block_size = seq_len % block_size;
    if (partial_block_size) full_blocks_end -= 1;
  }
  auto copy_entry = [&](const scalar_t* __restrict__ _src,
                        scalar_t* __restrict__ _dst) {
    for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
      _dst[i] = _src[i];
  };
  for (int pid = split_start; pid < full_blocks_end; ++pid) {
    auto block_id = batch_block_table[pid];
    auto block_start_ptr = src_cache + block_id * cache_block_stride;
    auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
    for (int eid = 0; eid < block_size; ++eid) {
      copy_entry(block_start_ptr + eid * cache_entry_stride,
                 block_dst_ptr + eid * dst_entry_stride);
    }
  }
  if (partial_block_size) {
    auto block_id = batch_block_table[full_blocks_end];
    auto block_start_ptr = src_cache + block_id * cache_block_stride;
    auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
    for (int eid = 0; eid < partial_block_size; ++eid) {
      copy_entry(block_start_ptr + eid * cache_entry_stride,
                 block_dst_ptr + eid * dst_entry_stride);
    }
  }
 }
 }  // namespace vllm
 // Macro to dispatch the kernel based on the data type.
 #define CALL_GATHER_CACHE(CPY_DTYPE)                                    \
  vllm::gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>(            \
      reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()),               \
      reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()),                     \
      block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
      block_size, entry_size, block_table_stride, cache_block_stride,   \
      cache_entry_stride, dst_entry_stride, seq_starts_ptr);
 // Gather sequences from the cache into the destination tensor.
 //  - cu_seq_lens contains the cumulative sequence lengths for each batch
 //  - block_table contains the cache block indices for each sequence
 //  - Optionally, seq_starts (if provided) offsets the starting block index by
 //  (seq_starts[bid] / page_size)
 void gather_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size,
    std::optional<torch::Tensor> seq_starts = std::nullopt) {
  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  int32_t block_size = src_cache.size(1);
  int32_t entry_size = src_cache.flatten(2, -1).size(2);
  TORCH_CHECK(block_table.dtype() == torch::kInt32,
              "block_table must be int32");
  TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
              "cu_seq_lens must be int32");
  if (seq_starts.has_value()) {
    TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
                "seq_starts must be int32");
  }
  TORCH_CHECK(src_cache.device() == dst.device(),
              "src_cache and dst must be on the same device");
  TORCH_CHECK(src_cache.device() == block_table.device(),
              "src_cache and block_table must be on the same device");
  TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
              "src_cache and cu_seq_lens must be on the same device");
  if (seq_starts.has_value()) {
    TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
                "src_cache and seq_starts must be on the same device");
  }
  int64_t block_table_stride = block_table.stride(0);
  int64_t cache_block_stride = src_cache.stride(0);
  int64_t cache_entry_stride = src_cache.stride(1);
  int64_t dst_entry_stride = dst.stride(0);
  // Decide on the number of splits based on the batch size.
  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
  dim3 grid(batch_size, num_splits);
  dim3 block(1024);
  TORCH_CHECK(src_cache.dtype() == dst.dtype(),
              "src_cache and dst must have the same dtype");
  const int dtype_bits = src_cache.element_size() * 8;
  const int32_t* seq_starts_ptr =
      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
  if (dtype_bits == 32) {
    CALL_GATHER_CACHE(uint32_t);
  } else if (dtype_bits == 16) {
    CALL_GATHER_CACHE(uint16_t);
  } else if (dtype_bits == 8) {
    CALL_GATHER_CACHE(uint8_t);
  } else {
    TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
  }
 }
--- a/csrc/core/math.hpp
+++ b/csrc/core/math.hpp
@@ -7,8 +7,3 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
  if (num <= 1) return num;
  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
 }
 template <typename T>
 inline constexpr std::enable_if_t<std::is_integral_v<T>, T> ceil_div(T a, T b) {
  return (a + b - 1) / b;
 }
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -24,8 +24,8 @@ struct KernelVecType<float> {
 template <>
 struct KernelVecType<c10::Half> {
-#ifdef __powerpc64__
+#if defined(__powerpc64__) || defined(__s390x__)
-  // Power architecture-specific vector types
+  // Power and s390x architecture-specific vector types
  using q_load_vec_type = vec_op::FP32Vec8;
  using k_load_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::FP32Vec16;
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -3,6 +3,12 @@
 #include "cpu_types.hpp"
 #if defined(__x86_64__)
  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
 #else
  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
 #endif
 namespace {
 template <typename scalar_t>
 void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
@@ -82,6 +88,48 @@ void reshape_and_cache_cpu_impl(
 }
 };  // namespace
 template <typename scalar_t>
 void concat_and_cache_mla_cpu_impl(
    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
    scalar_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
                                      // + pe_dim)]
    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
    const int num_tokens,                      //
    const int block_stride,                    //
    const int entry_stride,                    //
    const int kv_c_stride,                     //
    const int k_pe_stride,                     //
    const int kv_lora_rank,                    //
    const int pe_dim,                          //
    const int block_size                       //
 ) {
 #pragma omp parallel for
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    const int64_t slot_idx = slot_mapping[token_idx];
    // NOTE: slot_idx can be -1 if the token is padded
    if (slot_idx < 0) {
      continue;
    }
    const int64_t block_idx = slot_idx / block_size;
    const int64_t block_offset = slot_idx % block_size;
    auto copy = [&](const scalar_t* __restrict__ src,
                    scalar_t* __restrict__ dst, int src_stride, int dst_stride,
                    int size, int offset) {
      for (int i = 0; i < size; i++) {
        const int64_t src_idx = token_idx * src_stride + i;
        const int64_t dst_idx =
            block_idx * block_stride + block_offset * entry_stride + i + offset;
        dst[dst_idx] = src[src_idx];
      }
    };
    copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
    copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
  }
 }
 // Note: the key_caches and value_caches vectors are constant but
 // not the Tensors they contain. The vectors need to be const refs
 // in order to satisfy pytorch's C++ operator registration code.
@@ -95,13 +143,12 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
  }
  const int element_num_per_block = key_caches[0][0].numel();
-  VLLM_DISPATCH_FLOATING_TYPES(
+  DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
-      key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
+    CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
-        CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
+    copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
-        copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
+                                   element_num_per_block, num_layers);
-                                       element_num_per_block, num_layers);
+    CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
-        CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
+  });
      });
 }
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
@@ -118,15 +165,46 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
  int key_stride = key.stride(0);
  int value_stride = value.stride(0);
  DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
    CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
    reshape_and_cache_cpu_impl<scalar_t>(
        key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
        key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
        slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride,
        num_heads, head_size, block_size, x);
    CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
  });
 }
 void concat_and_cache_mla(
    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
                                  // pe_dim)]
    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
    const std::string& kv_cache_dtype, torch::Tensor& scale) {
  int num_tokens = slot_mapping.size(0);
  int kv_lora_rank = kv_c.size(1);
  int pe_dim = k_pe.size(1);
  int block_size = kv_cache.size(1);
  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
  TORCH_CHECK(kv_cache_dtype != "fp8");
  int kv_c_stride = kv_c.stride(0);
  int k_pe_stride = k_pe.stride(0);
  int block_stride = kv_cache.stride(0);
  int entry_stride = kv_cache.stride(1);
  VLLM_DISPATCH_FLOATING_TYPES(
-      key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
+      kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
-        CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
+        CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
-        reshape_and_cache_cpu_impl<scalar_t>(
+        concat_and_cache_mla_cpu_impl<scalar_t>(
-            key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
+            kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
-            key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
+            kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
-            slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
+            num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
-            value_stride, num_heads, head_size, block_size, x);
+            kv_lora_rank, pe_dim, block_size);
-        CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
+        CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
      });
 }
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -7,6 +7,9 @@
 #elif defined(__POWER9_VECTOR__)
  // ppc implementation
  #include "cpu_types_vsx.hpp"
 #elif defined(__s390x__)
  // s390 implementation
  #include "cpu_types_vxe.hpp"
 #elif defined(__aarch64__)
  // arm implementation
  #include "cpu_types_arm.hpp"
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -2,6 +2,10 @@
 #include <torch/all.h>
 #include <cmath>
 #if defined(__APPLE__)
  #include "omp.h"
 #endif
 namespace vec_op {
 #ifdef ARM_BF16_SUPPORT
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -0,0 +1,480 @@
 #ifndef CPU_TYPES_VXE_HPP
 #define CPU_TYPES_VXE_HPP
 #include <vecintrin.h>
 #include <cmath>
 #include <torch/all.h>
 namespace vec_op {
 #define vec_neg(a) (-(a))
 #define vec_add(a, b) ((a) + (b))
 #define vec_sub(a, b) ((a) - (b))
 #define vec_mul(a, b) ((a) * (b))
 #define vec_div(a, b) ((a) / (b))
 #define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebaic
 #define vec_sl(a, b) ((a) << (b))  // Vector Shift Left
 // FIXME: FP16 is not fully supported in Torch-CPU
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 #ifndef CPU_OP_GUARD
  #define CPU_KERNEL_GUARD_IN(NAME)
  #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
  #define CPU_KERNEL_GUARD_IN(NAME) \
    std::cout << #NAME << " invoked." << std::endl;
  #define CPU_KERNEL_GUARD_OUT(NAME) \
    std::cout << #NAME << " exit." << std::endl;
 #endif
 #define FORCE_INLINE __attribute__((always_inline)) inline
 namespace {
 template <typename T, T... indexes, typename F>
 constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
  (f(std::integral_constant<T, indexes>{}), ...);
 }
 };  // namespace
 template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
 constexpr void unroll_loop(F&& f) {
  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
 }
 template <typename T>
 struct Vec {
  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
 };
 typedef struct ss16x8x2_t {
  __vector signed short val[2];
 } ss16x8x2_t;
 typedef struct ss16x8x4_t {
  __vector signed short val[4];
 } ss16x8x4_t;
 typedef struct f32x4x2_t {
  __vector float val[2];
 } f32x4x2_t;
 typedef struct f32x4x4_t {
  __vector float val[4];
 } f32x4x4_t;
 struct FP32Vec8;
 struct FP32Vec16;
 struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  __vector signed short reg;
  explicit BF16Vec8(const void* ptr) : reg(*(__vector signed short*)ptr) {}
  explicit BF16Vec8(const FP32Vec8&);
  void save(void* ptr) const {
    *reinterpret_cast<__vector signed short*>(ptr) = reg;
  }
 };
 struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  ss16x8x2_t reg;
  explicit BF16Vec16(const void* ptr) {
    // Load 256 bits in two parts
    reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr);
    reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr);
  }
  explicit BF16Vec16(const FP32Vec16&);
  void save(void* ptr) const {
    // Save 256 bits in two parts
    vec_xst(reg.val[0], 0, (signed short*)ptr);
    vec_xst(reg.val[1], 16, (signed short*)ptr);
  }
 };
 const static __vector signed short zero = vec_splats((signed short)0);
 struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;
  ss16x8x4_t reg;
  explicit BF16Vec32(const void* ptr)
      : reg(*reinterpret_cast<const ss16x8x4_t*>(ptr)) {}
  explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}
  explicit BF16Vec32(const BF16Vec8& vec8_data)
      : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}
  void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; }
 };
 struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;
  union AliasReg {
    __vector float reg;
    float values[VEC_ELEM_NUM];
  };
  __vector float reg;
  explicit FP32Vec4(float v) : reg(vec_splats(v)) {}
  explicit FP32Vec4() : reg(vec_splats(0.0f)) {}
  explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {}
  explicit FP32Vec4(__vector float data) : reg(data) {}
  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}
 };
 struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  union AliasReg {
    f32x4x2_t reg;
    float values[VEC_ELEM_NUM];
  };
  f32x4x2_t reg;
  explicit FP32Vec8(float v) {
    reg.val[0] = vec_splats(v);
    reg.val[1] = vec_splats(v);
  }
  explicit FP32Vec8() {
    reg.val[0] = vec_splats(0.0f);
    reg.val[1] = vec_splats(0.0f);
  }
  explicit FP32Vec8(const float* ptr) {
    reg.val[0] = vec_xl(0, ptr);
    reg.val[1] = vec_xl(16, ptr);
  }
  explicit FP32Vec8(f32x4x2_t data) : reg(data) {}
  explicit FP32Vec8(const FP32Vec8& data) {
    reg.val[0] = data.reg.val[0];
    reg.val[1] = data.reg.val[1];
  }
  explicit FP32Vec8(const BF16Vec8& v) {
    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
    reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
  }
  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&result, &ar](int i) { result += ar.values[i]; });
    return result;
  }
  FP32Vec8 exp() const {
    // TODO: Vectorize this
    AliasReg ar;
    ar.reg = reg;
    f32x4x4_t ret;
    ret.val[0][0] = std::exp(ar.values[0]);
    ret.val[0][1] = std::exp(ar.values[1]);
    ret.val[0][2] = std::exp(ar.values[2]);
    ret.val[0][3] = std::exp(ar.values[3]);
    ret.val[1][0] = std::exp(ar.values[4]);
    ret.val[1][1] = std::exp(ar.values[5]);
    ret.val[1][2] = std::exp(ar.values[6]);
    ret.val[1][3] = std::exp(ar.values[7]);
    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
  }
  FP32Vec8 tanh() const {
    // TODO: Vectorize this
    AliasReg ar;
    ar.reg = reg;
    f32x4x4_t ret;
    ret.val[0][0] = std::tanh(ar.values[0]);
    ret.val[0][1] = std::tanh(ar.values[1]);
    ret.val[0][2] = std::tanh(ar.values[2]);
    ret.val[0][3] = std::tanh(ar.values[3]);
    ret.val[1][0] = std::tanh(ar.values[4]);
    ret.val[1][1] = std::tanh(ar.values[5]);
    ret.val[1][2] = std::tanh(ar.values[6]);
    ret.val[1][3] = std::tanh(ar.values[7]);
    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
  }
  FP32Vec8 er() const {
    // TODO: Vectorize this
    AliasReg ar;
    ar.reg = reg;
    f32x4x4_t ret;
    ret.val[0][0] = std::erf(ar.values[0]);
    ret.val[0][1] = std::erf(ar.values[1]);
    ret.val[0][2] = std::erf(ar.values[2]);
    ret.val[0][3] = std::erf(ar.values[3]);
    ret.val[1][0] = std::erf(ar.values[4]);
    ret.val[1][1] = std::erf(ar.values[5]);
    ret.val[1][2] = std::erf(ar.values[6]);
    ret.val[1][3] = std::erf(ar.values[7]);
    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
  }
  FP32Vec8 operator*(const FP32Vec8& b) const {
    return FP32Vec8(
        {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])});
  }
  FP32Vec8 operator+(const FP32Vec8& b) const {
    return FP32Vec8(
        {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])});
  }
  FP32Vec8 operator-(const FP32Vec8& b) const {
    return FP32Vec8(
        {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])});
  }
  FP32Vec8 operator/(const FP32Vec8& b) const {
    return FP32Vec8(
        {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])});
  }
  void save(float* ptr) const {
    vec_xst(reg.val[0], 0, ptr);
    vec_xst(reg.val[1], 16, ptr);
  }
 };
 struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  union AliasReg {
    f32x4x4_t reg;
    float values[VEC_ELEM_NUM];
  };
  f32x4x4_t reg;
  explicit FP32Vec16(float v) {
    reg.val[0] = vec_splats(v);
    reg.val[1] = vec_splats(v);
    reg.val[2] = vec_splats(v);
    reg.val[3] = vec_splats(v);
  }
  explicit FP32Vec16() {
    reg.val[0] = vec_splats(0.0f);
    reg.val[1] = vec_splats(0.0f);
    reg.val[2] = vec_splats(0.0f);
    reg.val[3] = vec_splats(0.0f);
  }
  explicit FP32Vec16(const float* ptr) {
    reg.val[0] = vec_xl(0, ptr);
    reg.val[1] = vec_xl(16, ptr);
    reg.val[2] = vec_xl(32, ptr);
    reg.val[3] = vec_xl(48, ptr);
  }
  explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
  explicit FP32Vec16(const FP32Vec16& data) {
    reg.val[0] = data.reg.val[0];
    reg.val[1] = data.reg.val[1];
    reg.val[2] = data.reg.val[2];
    reg.val[3] = data.reg.val[3];
  }
  explicit FP32Vec16(const FP32Vec4& data) {
    reg.val[0] = data.reg;
    reg.val[1] = data.reg;
    reg.val[2] = data.reg;
    reg.val[3] = data.reg;
  }
  explicit FP32Vec16(const FP32Vec8& data) {
    reg.val[0] = data.reg.val[0];
    reg.val[1] = data.reg.val[1];
    reg.val[2] = data.reg.val[0];
    reg.val[3] = data.reg.val[1];
  }
  explicit FP32Vec16(const BF16Vec16& v) {
    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
    reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
    reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
    reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
  }
  explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
  FP32Vec16 operator*(const FP32Vec16& b) const {
    return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]),
                                vec_mul(reg.val[1], b.reg.val[1]),
                                vec_mul(reg.val[2], b.reg.val[2]),
                                vec_mul(reg.val[3], b.reg.val[3])}));
  }
  FP32Vec16 operator+(const FP32Vec16& b) const {
    return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]),
                                vec_add(reg.val[1], b.reg.val[1]),
                                vec_add(reg.val[2], b.reg.val[2]),
                                vec_add(reg.val[3], b.reg.val[3])}));
  }
  FP32Vec16 operator-(const FP32Vec16& b) const {
    return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]),
                                vec_sub(reg.val[1], b.reg.val[1]),
                                vec_sub(reg.val[2], b.reg.val[2]),
                                vec_sub(reg.val[3], b.reg.val[3])}));
  }
  FP32Vec16 operator/(const FP32Vec16& b) const {
    return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]),
                                vec_div(reg.val[1], b.reg.val[1]),
                                vec_div(reg.val[2], b.reg.val[2]),
                                vec_div(reg.val[3], b.reg.val[3])}));
  }
  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&result, &ar](int i) { result += ar.values[i]; });
    return result;
  }
  template <int group_size>
  float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    const int start = idx * group_size;
    unroll_loop<int, group_size>(
        [&result, &start, ar](int i) { result += ar.values[start + i]; });
    return result;
  }
  void save(float* ptr) const {
    vec_xst(reg.val[0], 0, ptr);
    vec_xst(reg.val[1], 16, ptr);
    vec_xst(reg.val[2], 32, ptr);
    vec_xst(reg.val[3], 48, ptr);
  }
 };
 template <typename T>
 struct VecType {
  using vec_type = void;
 };
 template <typename T>
 using vec_t = typename VecType<T>::vec_type;
 template <>
 struct VecType<float> {
  using vec_type = FP32Vec8;
 };
 template <>
 struct VecType<c10::BFloat16> {
  using vec_type = BF16Vec8;
 };
 template <typename T>
 void storeFP32(float v, T* ptr) {
  *ptr = v;
 }
 inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
  acc = acc + a * b;
 }
 namespace c10 {
 struct BFloat16 {
  uint16_t value;  // Assume BFloat16 is defined as a struct containing a 16-bit
                   // value.
 };
 }  // namespace c10
 template <>
 inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
  c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
      reinterpret_cast<c10::BFloat16*>(&v);
  *ptr = *(v_ptr + 1);
 }
 #ifndef __VEC_CLASS_FP_NAN
  #define __VEC_CLASS_FP_NAN (1 << 6)
 #endif
 const static __vector unsigned char omask = {2,  3,  6,  7,  10, 11, 14, 15,
                                             18, 19, 22, 23, 26, 27, 30, 31};
 const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
                                           0x00007fff};
 const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000,
                                          0x7fc00000};
 const static __vector unsigned int sh16 = {16, 16, 16, 16};
 const static __vector unsigned int one = {1, 1, 1, 1};
 inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
  __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
  int cc;
  __vector __bool int sel0 =
      vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
  __vector __bool int sel1 =
      vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
  inp0 = vec_sel(inp0, nan, sel0) >> sh16;
  inp1 = vec_sel(inp1, nan, sel1) >> sh16;
  reg = (__vector signed short)vec_perm(inp0, inp1, omask);
 }
 inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
  __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
  __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
  __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
  int cc;
  __vector __bool int sel0 =
      vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
  __vector __bool int sel1 =
      vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
  __vector __bool int sel2 =
      vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
  __vector __bool int sel3 =
      vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
  inp0 = vec_sel(inp0, nan, sel0) >> sh16;
  inp1 = vec_sel(inp1, nan, sel1) >> sh16;
  inp2 = vec_sel(inp2, nan, sel2) >> sh16;
  inp3 = vec_sel(inp3, nan, sel3) >> sh16;
  reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
  reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
 }
 inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
 };  // namespace vec_op
 #endif
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -16,9 +16,18 @@ namespace vec_op {
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...)        \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 #define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME,                                \
                     VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__))
 #ifndef CPU_OP_GUARD
  #define CPU_KERNEL_GUARD_IN(NAME)
  #define CPU_KERNEL_GUARD_OUT(NAME)
@@ -69,9 +78,14 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
  __m256i reg;
  // normal load
  explicit FP16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
  // non-temproal load
  explicit FP16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
  explicit FP16Vec16(const FP32Vec16&);
  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@@ -101,9 +115,14 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  __m256i reg;
  // normal load
  explicit BF16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
  // non-temproal load
  explicit BF16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
  explicit BF16Vec16(const FP32Vec16&);
  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@@ -121,6 +140,8 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
  __m512i reg;
  explicit BF16Vec32() : reg(_mm512_setzero_si512()) {}
  explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
  explicit BF16Vec32(__m512i data) : reg(data) {}
@@ -302,8 +323,13 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
  // normal load
  explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
  // non-temproal load
  explicit FP32Vec16(bool, void* ptr)
      : reg((__m512)_mm512_stream_load_si512(ptr)) {}
  explicit FP32Vec16(__m512 data) : reg(data) {}
  explicit FP32Vec16(const FP32Vec4& data)
@@ -536,6 +562,33 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
    _mm_mask_storeu_epi8(ptr, mask, reg);
  }
 };
 struct INT8Vec64 : public Vec<INT8Vec64> {
  constexpr static int VEC_ELEM_NUM = 64;
  union AliasReg {
    __m512i reg;
    int8_t values[VEC_ELEM_NUM];
  };
  __m512i reg;
  // normal load
  explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
  // non-temproal load
  explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
  void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
  void save(int8_t* ptr, const int elem_num) const {
    constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
    __mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
    _mm512_mask_storeu_epi8(ptr, mask, reg);
  }
  // non-temproal save
  void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
 };
 #endif
 template <typename T>
@@ -646,6 +699,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
 inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }
 #ifdef __AVX512F__
 inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
  _mm256_stream_si256((__m256i*)ptr, vec.reg);
 }
 inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
  _mm512_stream_si512((__m512i*)ptr, vec.reg);
 }
 inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
  _mm256_stream_si256((__m256i*)ptr, vec.reg);
 }
 inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
  _mm512_stream_ps((float*)ptr, vec.reg);
 }
 #endif
 inline void mem_barrier() { _mm_mfence(); }
 };  // namespace vec_op
 #endif
--- a/csrc/cpu/mla_decode.cpp
+++ b/csrc/cpu/mla_decode.cpp
@@ -0,0 +1,393 @@
 #include "cpu_types.hpp"
 #include <float.h>
 namespace {
 template <typename scalar_t>
 struct KernelVecType {
  using qk_load_vec_type = void;
  using qk_vec_type = void;
  using v_load_vec_type = void;
 };
 template <>
 struct KernelVecType<float> {
  using qk_load_vec_type = vec_op::FP32Vec16;
  using qk_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::FP32Vec16;
 };
 template <>
 struct KernelVecType<c10::Half> {
 #if defined(__powerpc64__) || defined(__s390x__)
  // Power and s390x architecture-specific vector types
  using qk_load_vec_type = vec_op::FP32Vec16;
  using qk_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::FP32Vec16;
 #else
  // Fallback for other architectures, including x86
  using qk_load_vec_type = vec_op::FP16Vec16;
  using qk_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::FP16Vec16;
 #endif
 };
 #ifdef __AVX512BF16__
 template <>
 struct KernelVecType<c10::BFloat16> {
  using qk_load_vec_type = vec_op::BF16Vec32;
  using qk_vec_type = vec_op::BF16Vec32;
  using v_load_vec_type = vec_op::BF16Vec16;
 };
 #elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
 // pass
 #else
 template <>
 struct KernelVecType<c10::BFloat16> {
  using qk_load_vec_type = vec_op::BF16Vec16;
  using qk_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::BF16Vec16;
 };
 #endif
 template <int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE, int HEAD_UNROLL,
          typename qk_vec_type>
 void mla_decode_block_head(
    const qk_vec_type* __restrict__ q_vecs,          // [HEAD_UNROLL, head_dim]
    const qk_vec_type* __restrict__ k_vecs,          // [block_size, head_dim]
    const vec_op::FP32Vec16* __restrict v_vecs_f32,  // [block_size, v_head_dim]
    float* __restrict__ acc_out,  // [HEAD_UNROLL, v_head_dim]
    float* __restrict__ acc_lse,  // [HEAD_UNROLL]
    const float scale, const int num_tokens) {
  using f32_vec_type = vec_op::FP32Vec16;
  constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
  constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM;
  float logits[BLOCK_SIZE][HEAD_UNROLL] = {};  // initialize to zeros
  float max_val[HEAD_UNROLL];
  std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX);
  f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL];
  for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) {
    // load to registers
    qk_vec_type q_vec[HEAD_UNROLL];
 #pragma unroll
    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
      q_vec[unroll] =
          qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]};
    for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
      qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]);
 #pragma unroll
      for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
        vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec);
    }
  }
  for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
 #pragma unroll
    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
      const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale;
      logits[block_offset][unroll] = acc;
      max_val[unroll] = std::max(max_val[unroll], acc);
    }
  }
  float sum_exp[HEAD_UNROLL] = {};
  for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
 #pragma unroll
    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
      const float val =
          std::exp(logits[block_offset][unroll] - max_val[unroll]);
      logits[block_offset][unroll] = val;
      sum_exp[unroll] += val;
    }
  }
  f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL];
  for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
    // load to registers
    f32_vec_type scale_[HEAD_UNROLL];
 #pragma unroll
    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
      scale_[unroll] =
          f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]};
    for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
      f32_vec_type v_vec(
          v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]);
 #pragma unroll
      for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
        vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]);
    }
  }
  // merge attention state
  // section 2.2 in https://arxiv.org/pdf/2501.01005
  f32_vec_type prev_scale[HEAD_UNROLL];
  f32_vec_type curr_scale[HEAD_UNROLL];
 #pragma unroll
  for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
    const float prev_lse = acc_lse[unroll];
    const float curr_lse = std::log(sum_exp[unroll]) +
                           max_val[unroll];  // add back max_val to get true lse
    // softmax trick
    const float max_lse = std::max(prev_lse, curr_lse);
    const float prev_sum_exp = std::exp(prev_lse - max_lse);
    const float curr_sum_exp = std::exp(curr_lse - max_lse);
    const float new_sum_exp = prev_sum_exp + curr_sum_exp;
    acc_lse[unroll] = std::log(new_sum_exp) + max_lse;
    prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp};
    curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp};
  }
  for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
 #pragma unroll
    for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
      f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll);
      o_vec = o_vec * prev_scale[unroll] +
              this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll];
      o_vec.save(acc_out + i + V_HEAD_DIM * unroll);
    }
  }
  q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL;
  acc_out += V_HEAD_DIM * HEAD_UNROLL;
 }
 template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE,
          typename qk_vec_type>
 void mla_decode_block(
    const qk_vec_type* __restrict__ q_vecs,  // [num_heads, head_dim]
    const scalar_t* __restrict__ kv_cache,   // [block_size, head_dim]
    float* __restrict__ acc_out,             // [num_heads, v_head_dim]
    float* __restrict__ acc_lse,             // [num_heads]
    const int num_heads, const float scale, const int num_tokens) {
  using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
  static_assert(
      std::is_same<qk_vec_type,
                   typename KernelVecType<scalar_t>::qk_vec_type>::value);
  using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
  using f32_vec_type = vec_op::FP32Vec16;
  static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM);
  static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM);
  constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
  constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM;
  const qk_vec_type* k_vecs;
  const f32_vec_type* v_vecs_f32;
  float* kv_cache_f32 = nullptr;
  if constexpr (!std::is_same<scalar_t, float>::value) {
    // convert KV cache block to FP32 to reuse it across query heads and
    // attn @ V computation, since FP16/BF16->FP32 is expensive.
    // TODO: move malloc outside of this fn to reuse across iterations.
    const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float);
    kv_cache_f32 = static_cast<float*>(std::aligned_alloc(64, nbytes));
    for (int block_offset = 0; block_offset < num_tokens; ++block_offset)
      for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) {
        v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i);
        f32_vec_type kv_vec_f32(kv_load_vec);
        kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i);
      }
    if constexpr (std::is_same<qk_load_vec_type, qk_vec_type>::value) {
      // for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion)
      // NOTE: in this case, we only need to convert the V section to FP32.
      // But for simplicity, we will convert the whole KV block to FP32.
      k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
    } else {
      k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache_f32);
    }
    // attn @ V always use FP32 for V, since attn is FP32.
    v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache_f32);
  } else {
    // KV cache is FP32. don't need to do anything.
    k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
    v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache);
  }
  // compute 2 heads at the same time to improve ILP and
  // take advantage of register cache for K and V.
  constexpr int HEAD_UNROLL = 2;
  for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) {
    mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, HEAD_UNROLL>(
        q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
    q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM;
    acc_out += HEAD_UNROLL * V_HEAD_DIM;
    acc_lse += HEAD_UNROLL;
  }
  // take care of the remaining heads
  for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) {
    mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, 1>(
        q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
    q_vecs += HEAD_DIM / QK_NUM_ELEM;
    acc_out += V_HEAD_DIM;
    acc_lse += 1;
  }
  if (kv_cache_f32 != nullptr) {
    std::free(kv_cache_f32);
  }
 }
 }  // namespace
 template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE>
 void mla_decode_kvcache_cpu_impl(
    scalar_t* __restrict__ out,             // [num_seqs, num_heads, v_head_dim]
    const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_dim]
    const scalar_t* __restrict__ kv_cache,  // [num_blocks, block_size,
                                            // head_dim]
    const int num_heads, const float scale,
    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ seq_lens,      // [num_seqs]
    const int max_num_blocks_per_seq, const int o_stride, const int q_stride,
    const int kv_stride, const int num_seqs) {
  using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
  using qk_vec_type = typename KernelVecType<scalar_t>::qk_vec_type;
  constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
  // shared across threads
  const int max_threads = omp_get_max_threads();
  const int acc_out_nbytes =
      max_threads * num_heads * V_HEAD_DIM * sizeof(float);
  float* acc_out = static_cast<float*>(std::aligned_alloc(64, acc_out_nbytes));
  std::vector<float> acc_lse(max_threads * num_heads);
  // allocate memory to pre-convert query to FP32 later
  float* q_f32;
  constexpr bool PRE_CONVERT_QUERY =
      !std::is_same<scalar_t, float>::value &&
      std::is_same<qk_vec_type, vec_op::FP32Vec16>::value;
  if constexpr (PRE_CONVERT_QUERY) {
    const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float);
    q_f32 = static_cast<float*>(std::aligned_alloc(64, q_f32_nbytes));
  }
 #pragma omp parallel
  {
    const int num_threads = omp_get_num_threads();
    const int thread_id = omp_get_thread_num();
    float* __restrict__ acc_out_thread =
        acc_out + thread_id * num_heads * V_HEAD_DIM;
    float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads;
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      // reset accumulator
      std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f);
      std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX);
      const int seq_len = seq_lens[seq_idx];
      const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
      const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE;
      const qk_vec_type* q_vecs;
      if constexpr (PRE_CONVERT_QUERY) {
 // pre-convert query to FP32 since FP16/BF16->FP32 is slow.
 #pragma omp for
        for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) {
          qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i);
          qk_vec_type q_vec(q_load_vec);
          q_vec.save(q_f32 + i);
        }
        q_vecs = reinterpret_cast<const qk_vec_type*>(q_f32);
      } else {
        q_vecs = reinterpret_cast<const qk_vec_type*>(q + seq_idx * q_stride);
      }
 #pragma omp for
      for (int block_idx = 0; block_idx < block_num; ++block_idx) {
        const int physical_block_idx =
            block_tables[seq_idx * max_num_blocks_per_seq + block_idx];
        const int num_tokens =
            block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size;
        mla_decode_block<scalar_t, HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE>(
            q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread,
            acc_lse_thread, num_heads, scale, num_tokens);
      }
 // merge attention states across threads
 // section 2.2 in https://arxiv.org/pdf/2501.01005
 // each thread is responsible for 1 head
 #pragma omp for
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        float* acc_lse_head = acc_lse.data() + head_idx;
        float* acc_out_head = acc_out + head_idx * V_HEAD_DIM;
        float max_val = -FLT_MAX;
        for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
          max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]);
        }
        float sum_exp = 0.0f;
        for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
          float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val);
          acc_lse_head[thread_id_ * num_heads] = val;
          sum_exp += val;
        }
        float inv_sum = 1.0f / sum_exp;
        float out_head[V_HEAD_DIM] = {};
        for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
          float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum;
          for (int i = 0; i < V_HEAD_DIM; ++i) {
            out_head[i] +=
                acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_;
          }
        }
        for (int i = 0; i < V_HEAD_DIM; ++i) {
          vec_op::storeFP32(out_head[i], out + seq_idx * o_stride +
                                             head_idx * V_HEAD_DIM + i);
        }
      }
    }
  }
  if (PRE_CONVERT_QUERY) {
    std::free(q_f32);
  }
  std::free(acc_out);
 }
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens) {
  const int num_seqs = query.size(0);
  const int num_heads = query.size(1);
  const int head_dim = query.size(2);
  const int block_size = kv_cache.size(1);
  const int v_head_dim = out.size(2);
  const int max_num_blocks_per_seq = block_tables.size(1);
  const int o_stride = out.stride(0);
  const int q_stride = query.stride(0);
  const int kv_stride = kv_cache.stride(0);
  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] {
        CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl)
        if (head_dim == 576 && v_head_dim == 512 && block_size == 16)
          mla_decode_kvcache_cpu_impl<scalar_t, 576, 512, 16>(
              out.data_ptr<scalar_t>(), query.data_ptr<scalar_t>(),
              kv_cache.data_ptr<scalar_t>(), num_heads, scale,
              block_tables.data_ptr<int>(), seq_lens.data_ptr<int>(),
              max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs);
        else
          TORCH_CHECK(false, "Unsupported block size: ", block_size);
        CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl)
      });
 }
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -170,7 +170,7 @@ void rotary_embedding_gptj_impl(
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
-  int num_tokens = query.numel() / query.size(-1);
+  int num_tokens = positions.numel();
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@@ -25,7 +25,7 @@ struct KernelVecType<c10::BFloat16> {
 template <>
 struct KernelVecType<c10::Half> {
-#ifdef __powerpc64__
+#if defined(__powerpc64__) || defined(__s390x__)
  // Power architecture-specific vector type
  using load_vec_type = vec_op::FP32Vec16;
 #else
--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@@ -0,0 +1,781 @@
 #include "cpu/cpu_types.hpp"
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
 namespace {
 #define MAX_SHM_RANK_NUM 8
 #define MAX_THREAD_NUM 12
 #define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
 #define MIN_THREAD_PROCESS_SIZE (8 * 1024)
 #define MAX_P2P_SEND_TENSOR_NUM 8
 template <typename scalar_t>
 struct KernelVecType {
  using scalar_vec_t = void;
 };
 template <>
 struct KernelVecType<float> {
  using scalar_vec_t = vec_op::FP32Vec16;
 };
 template <>
 struct KernelVecType<c10::BFloat16> {
  using scalar_vec_t = vec_op::BF16Vec16;
 };
 template <>
 struct KernelVecType<c10::Half> {
  using scalar_vec_t = vec_op::FP16Vec16;
 };
 enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE };
 struct ThreadSHMContext {
  volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM];
  int thread_id;
  int thread_num;
  int rank;
  int group_size;
  size_t _spinning_count;
  int swizzled_ranks[MAX_SHM_RANK_NUM];
  void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
  ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
  ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
                   const int group_size, void* thread_shm_ptr)
      : thread_id(thread_id),
        thread_num(thread_num),
        rank(rank),
        group_size(group_size),
        _spinning_count(0) {
    static_assert(sizeof(ThreadSHMContext) % 64 == 0);
    TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
    TORCH_CHECK((size_t)this % 64 == 0);
    TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
    for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
      shm_contexts[i] = nullptr;
      thread_shm_ptrs[i] = nullptr;
      swizzled_ranks[i] = (i + rank) % group_size;
      thread_stats[i] = ThreadSHMStat::DONE;
    }
    set_context(rank, this, thread_shm_ptr);
  }
  void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
    TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
    TORCH_CHECK(ptr);
    TORCH_CHECK(thread_shm_ptr);
    TORCH_CHECK_EQ(ptr->thread_num, thread_num);
    TORCH_CHECK_EQ(ptr->thread_id, thread_id);
    shm_contexts[rank] = ptr;
    thread_shm_ptrs[rank] = thread_shm_ptr;
  }
  template <typename T>
  T* get_thread_shm_ptr(int rank) {
    return reinterpret_cast<T*>(thread_shm_ptrs[rank]);
  }
  int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
  void wait_for_all(ThreadSHMStat prev_stat) {
    for (int idx = 0; idx < group_size; ++idx) {
      int rank = get_swizzled_rank(idx);
      while (thread_stats[rank] == prev_stat) {
        ++_spinning_count;
        _mm_pause();
      }
    }
    vec_op::mem_barrier();
  }
  void wait_for_one(int rank, ThreadSHMStat prev_stat) {
    while (thread_stats[rank] == prev_stat) {
      ++_spinning_count;
      _mm_pause();
    }
    vec_op::mem_barrier();
  }
  void set_thread_stat(ThreadSHMStat stat) {
    for (int idx = 0; idx < group_size; ++idx) {
      int rank = get_swizzled_rank(idx);
      shm_contexts[rank]->thread_stats[this->rank] = stat;
    }
  }
  void set_thread_stat(int target_rank, ThreadSHMStat stat) {
    for (int idx = 0; idx < group_size; ++idx) {
      int rank = get_swizzled_rank(idx);
      shm_contexts[rank]->thread_stats[target_rank] = stat;
    }
  }
  // barrier for all ranks in the group, used for all2all ops
  // DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ...
  void barrier(ThreadSHMStat next_stat) {
    if (next_stat == ThreadSHMStat::THREAD_READY) {
      set_thread_stat(ThreadSHMStat::THREAD_READY);
      wait_for_all(ThreadSHMStat::DONE);
    } else if (next_stat == ThreadSHMStat::SHM_DATA_READY) {
      set_thread_stat(ThreadSHMStat::SHM_DATA_READY);
      wait_for_all(ThreadSHMStat::THREAD_READY);
    } else if (next_stat == ThreadSHMStat::DONE) {
      set_thread_stat(ThreadSHMStat::DONE);
      wait_for_all(ThreadSHMStat::SHM_DATA_READY);
    } else {
      TORCH_CHECK(false, "Invalid next_stat to barrier.");
    }
  }
  std::string to_string() const {
    std::stringstream ss;
    ss << "SHMContext:";
    ss << "\nrank: " << rank;
    ss << "\ngroup_size: " << group_size;
    ss << "\nthread_num: " << thread_num;
    ss << "\nthread_id: " << thread_id;
    ss << "\nshm_ctx_stat_loop_seq: [";
    for (int i = 0; i < group_size; ++i) {
      ss << swizzled_ranks[i] << ", ";
    }
    ss << "]";
    ss << "\nshm_contexts: [";
    for (int i = 0; i < group_size; ++i) {
      if (shm_contexts[i]) {
        ss << shm_contexts[i]->rank << ", ";
      }
    }
    ss << "]";
    return ss.str();
  }
 };
 class SHMManager {
 public:
  explicit SHMManager(const std::string& name, const int rank,
                      const int group_size)
      : _rank(rank),
        _group_size(group_size),
        _thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)),
        _shm_names({""}),
        _shared_mem_ptrs({nullptr}),
        _shm_ctx(nullptr) {
    _shm_names[rank] = get_shm_name(name, rank);
    _shared_mem_ptrs[rank] = init_shm(rank);
    _shm_ctx = reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank]);
    for (int i = 0; i < _thread_num; ++i) {
      ThreadSHMContext* ctx = new (_shm_ctx + i)
          ThreadSHMContext(i, _thread_num, _rank, _group_size,
                           compute_thread_shm_ptr(_shm_ctx, i));
    }
  }
  void join(const std::string& name) {
    for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) {
      if (rank_idx != _rank) {
        TORCH_CHECK(_shm_names[rank_idx].empty());
        TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr);
        _shm_names[rank_idx] = get_shm_name(name, rank_idx);
        _shared_mem_ptrs[rank_idx] = init_shm(rank_idx);
        ThreadSHMContext* target_ctx =
            reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank_idx]);
        for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) {
          _shm_ctx[thread_idx].set_context(
              rank_idx, target_ctx + thread_idx,
              compute_thread_shm_ptr(target_ctx, thread_idx));
        }
      }
    }
  }
  ~SHMManager() { destroy_shm(); }
  ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; }
  static std::string get_shm_name(const std::string& name, int rank) {
    return name + "_" + std::to_string(rank);
  }
  static int64_t create_singleton_instance(const std::string& name,
                                           const int group_size,
                                           const int rank) {
    std::lock_guard<std::mutex> guard(SingletonInstancesLock);
    SingletonInstances.emplace_back(
        std::make_unique<SHMManager>(name, rank, group_size));
    return static_cast<int64_t>(SingletonInstances.size() - 1);
  }
  static SHMManager* get_singleton_instance(int64_t handle) {
    return SingletonInstances[handle].get();
  }
 protected:
  static std::vector<std::unique_ptr<SHMManager>> SingletonInstances;
  static std::mutex SingletonInstancesLock;
 private:
  static size_t round_to_alignment(size_t num) {
    return ((num + 63) / 64) * 64;
  }
  int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) {
    int8_t* thread_shm_ptr =
        reinterpret_cast<int8_t*>(ctx) +
        round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
    return thread_shm_ptr +
           thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES);
  }
  size_t compute_shm_size() {
    const size_t rounded_rank_buffer_size =
        round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num;
    const size_t rounded_thread_shm_ctx_size =
        round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
    const size_t shm_size =
        rounded_thread_shm_ctx_size + rounded_rank_buffer_size;
    return shm_size;
  }
  void* init_shm(int target_rank) {
    const std::string& shm_name = _shm_names[target_rank];
    const int local_rank = _rank;
    const size_t shm_size = compute_shm_size();
    int fd = -1;
    if (local_rank == target_rank) {
      fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR,
                    S_IRUSR | S_IWUSR);
      if (fd == -1)
        TORCH_CHECK(false, "create shm in SHMManager failed. errno: " +
                               std::to_string(errno));
      if (ftruncate(fd, shm_size) == -1)
        TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " +
                               std::to_string(errno));
    } else {
      fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
      if (fd == -1)
        TORCH_CHECK(false, "open shm in SHMManager failed. errno: " +
                               std::to_string(errno));
    }
    void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_POPULATE, fd, 0);
    if (shm_ptr == MAP_FAILED) {
      TORCH_CHECK(false,
                  "mmap in SHMManager failed. errno: " + std::to_string(errno));
    }
    if (close(fd) != 0) {
      TORCH_CHECK(
          false, "close in SHMManager failed. errno: " + std::to_string(errno));
    }
    TORCH_CHECK((size_t)shm_ptr % 64 == 0);
    return shm_ptr;
  }
  void destroy_shm() {
    std::stringstream ss;
    ss << "local rank " << _rank << ": [";
    for (int thread_id = 0; thread_id < _thread_num; ++thread_id) {
      ss << _shm_ctx[thread_id]._spinning_count << ", ";
    }
    ss << "]\n";
    for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
      if (_shared_mem_ptrs[i] != nullptr) {
        munmap(_shared_mem_ptrs[i], compute_shm_size());
      }
      if (!_shm_names[i].empty()) {
        shm_unlink(_shm_names[i].c_str());
      }
    }
  }
  int _rank;
  int _group_size;
  int _thread_num;
  std::array<std::string, MAX_SHM_RANK_NUM> _shm_names;
  std::array<void*, MAX_SHM_RANK_NUM> _shared_mem_ptrs;
  ThreadSHMContext* _shm_ctx;
 };
 namespace shm_cc_ops {
 template <typename scalar_t, typename F>
 void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
  int thread_num = ctx->thread_num;
  int64_t total_bytes = elem_num * sizeof(scalar_t);
  int64_t total_units_num =
      (total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE;
  int64_t per_thread_units_num =
      (total_units_num + thread_num - 1) / thread_num;
  int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t);
  int64_t max_per_thread_iteration_elem_num =
      PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t);
  int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num;
 #pragma omp parallel for schedule(static, 1)
  for (int i = 0; i < thread_num; ++i) {
    int64_t offset = i * per_thread_elem_num;
    int64_t end = std::min(elem_num, offset + per_thread_elem_num);
    int64_t curr_elem_num =
        std::min(max_per_thread_iteration_elem_num, end - offset);
    ThreadSHMContext* thread_ctx = ctx + i;
    while (curr_elem_num > 0) {
      inner_func(thread_ctx, offset, curr_elem_num);
      offset += max_per_thread_iteration_elem_num;
      curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset);
    }
  }
 }
 };  // namespace shm_cc_ops
 namespace shm_cc_ops {
 void memcpy_from_shm(void* dst, void* src, const int64_t bytes) {
  const int64_t aligned_bytes = ((bytes >> 6) << 6);  // 64 bytes aligned
  int64_t i = 0;
 #pragma GCC unroll 4
  for (; i < aligned_bytes; i += 64) {
    vec_op::INT8Vec64 data(
        true, (int8_t*)src + i);  // stream loading shm to avoid caching
    data.save((int8_t*)dst + i);
  }
  if (aligned_bytes < bytes) {
    vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes);
    data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
  }
 }
 void memcpy_to_shm(void* dst, void* src, const int64_t bytes) {
 #pragma GCC unroll 4
  for (int64_t i = 0; i < bytes; i += 64) {
    vec_op::INT8Vec64 data((int8_t*)src + i);
    data.nt_save((int8_t*)dst + i);
  }
 }
 void memcpy(void* dst, void* src, const int64_t bytes) {
  const int64_t aligned_bytes = ((bytes >> 6) << 6);  // 64 bytes aligned
  int64_t i = 0;
 #pragma GCC unroll 4
  for (; i < aligned_bytes; i += 64) {
    vec_op::INT8Vec64 data((int8_t*)src + i);
    data.save((int8_t*)dst + i);
  }
  if (aligned_bytes < bytes) {
    vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes);
    data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
  }
 }
 template <typename scalar_t, int RANKS>
 void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data,
                         size_t elem_num) {
  CPU_KERNEL_GUARD_IN(all_reduce_sum_impl)
  using vec_t = typename KernelVecType<scalar_t>::scalar_vec_t;
  constexpr int64_t vec_elem_num = vec_t::get_elem_num();
  const int worldsize = ctx->group_size;
  shm_cc_ops::shm_cc_loop<scalar_t>(
      ctx, elem_num,
      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
          int64_t data_elem_num) {
        int rank = thread_ctx->rank;
        scalar_t* thread_shm_ptr =
            thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
        scalar_t* thread_data_ptr = data + data_offset;
        int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t);
        scalar_t* remote_data_ptrs[RANKS - 1];
        vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
          remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr<scalar_t>(
              thread_ctx->get_swizzled_rank(idx + 1));
        });
        thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
        shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr,
                                  thread_data_elem_num);
        thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
        int64_t aligned_data_elem_num =
            (data_elem_num / vec_elem_num) * vec_elem_num;
        int64_t i = 0;
 #pragma GCC unroll 4
        for (; i < aligned_data_elem_num; i += vec_elem_num) {
          vec_t local_data(thread_data_ptr + i);  // load from cache
          vec_op::FP32Vec16 local_data_fp32(local_data);
          vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
            vec_t remote_data(
                true, remote_data_ptrs[idx] + i);  // stream load from shm
            vec_op::FP32Vec16 remote_data_fp32(remote_data);
            local_data_fp32 = local_data_fp32 + remote_data_fp32;  // sum reduce
          });
          vec_t reduced_data(local_data_fp32);
          reduced_data.save(thread_data_ptr + i);
        }
        if (i < data_elem_num) {
          vec_t local_data(thread_data_ptr + i);  // load from cache
          vec_op::FP32Vec16 local_data_fp32(local_data);
          vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
            vec_t remote_data(
                true, remote_data_ptrs[idx] + i);  // stream load from shm
            vec_op::FP32Vec16 remote_data_fp32(remote_data);
            local_data_fp32 = local_data_fp32 + remote_data_fp32;  // sum reduce
          });
          vec_t reduced_data(local_data_fp32);
          reduced_data.save(thread_data_ptr + i,
                            data_elem_num - aligned_data_elem_num);
        }
        thread_ctx->barrier(ThreadSHMStat::DONE);
      });
  return;
 }
 };  // namespace shm_cc_ops
 std::vector<std::unique_ptr<SHMManager>> SHMManager::SingletonInstances = {};
 std::mutex SHMManager::SingletonInstancesLock = {};
 template <typename scalar_t>
 void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) {
  switch (ctx->group_size) {
    case 2:
      shm_cc_ops::all_reduce_sum_impl<scalar_t, 2>(ctx, data, elem_num);
      break;
    case 3:
      shm_cc_ops::all_reduce_sum_impl<scalar_t, 3>(ctx, data, elem_num);
      break;
    case 4:
      shm_cc_ops::all_reduce_sum_impl<scalar_t, 4>(ctx, data, elem_num);
      break;
    case 8:
      shm_cc_ops::all_reduce_sum_impl<scalar_t, 8>(ctx, data, elem_num);
      break;
    default:
      TORCH_CHECK(false,
                  "Invalid world size: " + std::to_string(ctx->group_size));
  }
 }
 template <typename scalar_t>
 void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num,
                     scalar_t** outputs, const int dst) {
  CPU_KERNEL_GUARD_IN(shm_gather_impl)
  const int worldsize = ctx->group_size;
  TORCH_CHECK_LT(dst, worldsize);
  shm_cc_ops::shm_cc_loop<scalar_t>(
      ctx, elem_num,
      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
          int64_t data_elem_num) {
        int rank = thread_ctx->rank;
        scalar_t* thread_shm_ptr =
            thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
        thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
        shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset,
                                  data_elem_num * sizeof(scalar_t));
        thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
        if (rank == dst) {
          shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset,
                             data_elem_num * sizeof(scalar_t));
          for (int i = 1; i < worldsize; ++i) {
            int src_rank = thread_ctx->get_swizzled_rank(i);
            scalar_t* src_ptr =
                thread_ctx->get_thread_shm_ptr<scalar_t>(src_rank);  // shm
            scalar_t* dst_ptr = outputs[src_rank] + data_offset;
            shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr,
                                        data_elem_num * sizeof(scalar_t));
          }
        }
        thread_ctx->barrier(ThreadSHMStat::DONE);
      });
  return;
 }
 struct MemPiece {
  void* ptr;
  int64_t size;
  template <typename T>
  T* data_ptr() {
    return reinterpret_cast<T*>(ptr);
  }
 };
 struct TensorListMeta {
  int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM];
  torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM];
  int64_t tensor_num;
  int64_t total_bytes;
  TensorListMeta() : tensor_num(0), total_bytes(0) {
    static_assert(sizeof(TensorListMeta) % 64 == 0);
    static_assert(sizeof(TensorListMeta) <
                  MIN_THREAD_PROCESS_SIZE);  // To ensure the metadata always
                                             // hold by the thread 0
    for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) {
      tensor_bytes[i] = 0;
      tensor_ptrs[i] = nullptr;
      tensor_types[i] = torch::ScalarType::Undefined;
    }
  }
  // For send and recv
  void bind_tensor_list(std::vector<torch::Tensor>& tensor_list) {
    TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined,
                "Re-bind TensorListMeta is not allowed.")
    TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM);
    tensor_num = tensor_list.size();
    int64_t bytes_sum = 0;
    for (int i = 0; i < tensor_list.size(); ++i) {
      torch::Tensor& t = tensor_list[i];
      TORCH_CHECK(t.is_contiguous());
      tensor_bytes[i] = t.nbytes();
      tensor_types[i] = t.scalar_type();
      tensor_ptrs[i] = t.data_ptr();
      bytes_sum += t.nbytes();
    }
    total_bytes = bytes_sum;
  }
  // For recv
  std::vector<torch::Tensor> generate_tensor_list() {
    std::vector<torch::Tensor> tensor_list;
    tensor_list.reserve(tensor_num);
    for (int i = 0; i < tensor_num; ++i) {
      int64_t bytes = tensor_bytes[i];
      auto type = tensor_types[i];
      int64_t elem_bytes = torch::elementSize(type);
      TORCH_CHECK_EQ(bytes % elem_bytes, 0);
      int64_t elem_num = bytes / elem_bytes;
      auto options = torch::TensorOptions().dtype(type).device(torch::kCPU);
      tensor_list.emplace_back(torch::empty({elem_num}, options));
    }
    return tensor_list;
  }
  MemPiece get_data(int64_t offset) {
    for (int i = 0; i < tensor_num; ++i) {
      if (offset < tensor_bytes[i]) {
        return {reinterpret_cast<int8_t*>(tensor_ptrs[i]) + offset,
                tensor_bytes[i] - offset};
      }
      offset -= tensor_bytes[i];
    }
    return {nullptr, 0};
  }
 private:
  void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM];
  int8_t _padding[40];
 };
 void shm_send_tensor_list_impl(ThreadSHMContext* ctx,
                               const std::vector<torch::Tensor>& tensor_list) {
  CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl)
  std::vector<torch::Tensor> tensor_list_with_metadata;
  tensor_list_with_metadata.reserve(1 + tensor_list.size());
  auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
  tensor_list_with_metadata.emplace_back(
      torch::empty({sizeof(TensorListMeta)}, options));
  tensor_list_with_metadata.insert(tensor_list_with_metadata.end(),
                                   tensor_list.begin(), tensor_list.end());
  torch::Tensor& metadata_tensor = tensor_list_with_metadata[0];
  TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta));
  TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
  metadata->bind_tensor_list(tensor_list_with_metadata);
  shm_cc_ops::shm_cc_loop<int8_t>(
      ctx, metadata->total_bytes,
      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
          int64_t data_elem_num) {
        int rank = thread_ctx->rank;
        // Wait until the receiver set the stat to DONE
        thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY);
        int64_t curr_shm_offset = 0;
        while (curr_shm_offset < data_elem_num) {
          MemPiece frag = metadata->get_data(data_offset + curr_shm_offset);
          frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
          shm_cc_ops::memcpy(
              thread_ctx->get_thread_shm_ptr<int8_t>(rank) + curr_shm_offset,
              frag.ptr, frag.size);
          curr_shm_offset += frag.size;
        }
        thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY);
      });
 }
 std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
                                                     int64_t src) {
  CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl)
  auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
  torch::Tensor metadata_tensor =
      torch::empty({sizeof(TensorListMeta)}, options);
  // Wait until the sender set the stat of the thread 0 to SHM_DATA_READY
  ctx->wait_for_one(src, ThreadSHMStat::DONE);
  shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
                     ctx->get_thread_shm_ptr<void>(src),
                     sizeof(TensorListMeta));
  TensorListMeta* src_metadata =
      reinterpret_cast<TensorListMeta*>(metadata_tensor.data_ptr());
  std::vector<torch::Tensor> tensor_list_with_metadata =
      src_metadata->generate_tensor_list();
  TensorListMeta metadata;
  metadata.bind_tensor_list(tensor_list_with_metadata);
  TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num);
  TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes);
  shm_cc_ops::shm_cc_loop<int8_t>(
      ctx, metadata.total_bytes,
      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
          int64_t data_elem_num) {
        // Wait until the sender set the stat to SHM_DATA_READY
        thread_ctx->wait_for_one(src, ThreadSHMStat::DONE);
        int64_t curr_shm_offset = 0;
        while (curr_shm_offset < data_elem_num) {
          MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
          frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
          shm_cc_ops::memcpy(
              frag.ptr,
              thread_ctx->get_thread_shm_ptr<int8_t>(src) + curr_shm_offset,
              frag.size);
          curr_shm_offset += frag.size;
        }
        thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE);
      });
  std::vector<torch::Tensor> tensor_list;
  tensor_list.reserve(metadata.tensor_num - 1);
  tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1,
                     tensor_list_with_metadata.end());
  return tensor_list;
 }
 }  // namespace
 void shm_gather(int64_t handle, torch::Tensor& data,
                const std::optional<std::vector<torch::Tensor>>& outputs,
                int64_t dst) {
  TORCH_CHECK(data.is_contiguous())
  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] {
    CPU_KERNEL_GUARD_IN(shm_gather_impl)
    if (outputs.has_value()) {
      TORCH_CHECK_LE(outputs->size(), MAX_SHM_RANK_NUM);
      scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
      for (int i = 0; i < outputs->size(); ++i) {
        output_ptrs[i] = outputs->at(i).data_ptr<scalar_t>();
      }
      shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
                      data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
                      dst);
    } else {
      shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
                      data.data_ptr<scalar_t>(), data.numel(), (scalar_t**)(0),
                      dst);
    }
    CPU_KERNEL_GUARD_OUT(shm_gather_impl)
  });
 }
 void shm_all_gather(int64_t handle, const torch::Tensor& data,
                    torch::Tensor& output) {
  TORCH_CHECK(data.is_contiguous())
  TORCH_CHECK(output.is_contiguous())
  const int64_t input_elem_num = data.numel();
  const int64_t output_elem_num = output.numel();
  TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0);
  const int world_size = output_elem_num / input_elem_num;
  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] {
    CPU_KERNEL_GUARD_IN(shm_all_gather_impl)
    auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
    TORCH_CHECK_EQ(ctx->group_size, world_size);
    scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
    for (int i = 0; i < world_size; ++i) {
      output_ptrs[i] = output.data_ptr<scalar_t>() + i * input_elem_num;
    }
    shm_gather_impl(ctx, data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
                    ctx->rank);
    CPU_KERNEL_GUARD_OUT(shm_all_gather_impl)
  });
 }
 void shm_allreduce(int64_t handle, torch::Tensor& data) {
  TORCH_CHECK(data.is_contiguous())
  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] {
    CPU_KERNEL_GUARD_IN(shm_allreduce_sum)
    shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
                      data.data_ptr<scalar_t>(), data.numel());
    CPU_KERNEL_GUARD_OUT(shm_allreduce_sum)
  });
 }
 void shm_send_tensor_list(int64_t handle,
                          const std::vector<torch::Tensor>& tensor_list,
                          int64_t dst) {
  CPU_KERNEL_GUARD_IN(shm_send_tensor_list)
  shm_send_tensor_list_impl(
      SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list);
  CPU_KERNEL_GUARD_OUT(shm_send_tensor_list)
 }
 std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
  CPU_KERNEL_GUARD_IN(shm_recv_tensor_list)
  auto tensor_list = shm_recv_tensor_list_impl(
      SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src);
  CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list)
  return tensor_list;
 }
 int64_t init_shm_manager(const std::string& name, const int64_t group_size,
                         const int64_t rank) {
  return SHMManager::create_singleton_instance(name, group_size, rank);
 }
 std::string join_shm_manager(int64_t handle, const std::string& name) {
  auto shm_manager = SHMManager::get_singleton_instance(handle);
  TORCH_CHECK(shm_manager);
  shm_manager->join(name);
  return shm_manager->get_shm_ctx()->to_string();
 }
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -18,6 +18,30 @@ void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
                        const std::optional<torch::Tensor>& azp,
                        const std::optional<torch::Tensor>& bias);
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens);
 int64_t init_shm_manager(const std::string& name, const int64_t group_size,
                         const int64_t rank);
 std::string join_shm_manager(int64_t handle, const std::string& name);
 void shm_allreduce(int64_t handle, torch::Tensor& data);
 void shm_gather(int64_t handle, torch::Tensor& data,
                const std::optional<std::vector<torch::Tensor>>& outputs,
                int64_t dst);
 void shm_all_gather(int64_t handle, const torch::Tensor& data,
                    torch::Tensor& output);
 void shm_send_tensor_list(int64_t handle,
                          const std::vector<torch::Tensor>& tensor_list,
                          int64_t dst);
 std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src);
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops
@@ -127,6 +151,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                  Tensor? azp, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #endif
 // SHM CCL
 #ifdef __AVX512F__
  ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
          &init_shm_manager);
  ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
  ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
  ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
  ops.def(
      "shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> "
      "()");
  ops.impl("shm_gather", torch::kCPU, &shm_gather);
  ops.def(
      "shm_all_gather(int handle, Tensor data, Tensor! output) -> "
      "()");
  ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather);
  ops.def(
      "shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> "
      "()");
  ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
  ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
          &shm_recv_tensor_list);
 #endif
 }
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
@@ -150,6 +197,14 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "                  str kv_cache_dtype,"
      "                  Tensor k_scale, Tensor v_scale) -> ()");
  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
  cache_ops.def(
      "concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
      "                     Tensor! kv_cache,"
      "                     Tensor slot_mapping,"
      "                     str kv_cache_dtype,"
      "                     Tensor scale) -> ()");
  cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
 }
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
@@ -157,4 +212,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
  utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
 }
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) {
  cpu_ops.def(
      "mla_decode_kvcache("
      "   Tensor! out, Tensor query, Tensor kv_cache,"
      "   float scale, Tensor block_tables, Tensor seq_lens) -> ()");
  cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
 }
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -4,6 +4,11 @@
  #include <string>
  #include <sched.h>
 #endif
 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
  #include <unistd.h>
  #include <sys/syscall.h>
  #define gettid() syscall(SYS_gettid)
 #endif
 #include "cpu_types.hpp"
@@ -18,7 +23,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 #ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
-  bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
+  bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
  TORCH_CHECK(omp_cpu_mask->size > 0);
  std::vector<int> omp_cpu_ids;
  omp_cpu_ids.reserve(omp_cpu_mask->size);
--- a/Show More
+++ b/Show More