Compare commits

616 Commits

Author SHA1 Message Date
Simon Mo
09c7792610 Bump version to v0.5.5 (#7823)
2024-08-23 11:35:33 -07:00
Dipika Sikka
f1df5dbfd6 [Misc] Update marlin to use vLLMParameters (#7803) 2024-08-23 14:30:52 -04:00
youkaichao
35ee2ad6b9 [github][misc] promote asking llm first (#7809) 2024-08-23 09:38:50 -07:00
Maximilien de Bayser
e25fee57c2 [BugFix] Fix server crash on empty prompt (#7746)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2024-08-23 13:12:44 +00:00
Jie Fu (傅杰)
faeddb565d [misc] Add Torch profiler support for CPU-only devices (#7806) 2024-08-23 05:46:25 +00:00
Kunshang Ji
fc5ebbd1d3 [Hardware][Intel GPU] refactor xpu_model_runner for tp (#7712) 2024-08-22 20:06:54 -07:00
SangBin Cho
c01a6cb231 [Ray backend] Better error when pg topology is bad. (#7584)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-08-22 17:44:25 -07:00
Joe Runde
b903e1ba7f [Frontend] error suppression cleanup (#7786)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-08-22 21:50:21 +00:00
Siyuan Liu
a152246428 [Misc] fix typo in triton import warning (#7794) 2024-08-22 13:51:23 -07:00
Kevin H. Luu
666ad0aa16 [ci] Cleanup & refactor Dockerfile to pass different Python versions and sccache bucket via build args (#7705)
Signed-off-by: kevin <kevin@anyscale.com>
2024-08-22 20:10:55 +00:00
Michael Goin
15310b5101 [Bugfix] Use LoadFormat values for vllm serve --load-format (#7784) 2024-08-22 11:37:08 -07:00
Peter Salas
57792ed469 [Doc] Fix incorrect docs from #7615 (#7788) 2024-08-22 10:02:06 -07:00
Jiaxin Shan
d3b5b98021 [Misc] Enhance prefix-caching benchmark tool (#6568) 2024-08-22 09:32:02 -07:00
Travis Johnson
cc0eaf12b1 [Bugfix] spec decode handle None entries in topk args in create_sequence_group_output (#7232)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-08-22 09:33:48 -04:00
Dipika Sikka
955b5191c9 [Misc] update fp8 to use vLLMParameter (#7437) 2024-08-22 08:36:18 -04:00
Lucas Wilkinson
55d63b1211 [Bugfix] Don't build machete on cuda <12.0 (#7757) 2024-08-22 08:28:52 -04:00
Flex Wang
4f419c00a6 Fix ShardedStateLoader for vllm fp8 quantization (#7708) 2024-08-22 08:25:04 -04:00
Abhinav Goyal
a3fce56b88 [Speculative Decoding] EAGLE Implementation with Top-1 proposer (#6830) 2024-08-22 02:42:24 -07:00
Woosuk Kwon
b3856bef7d [Misc] Use torch.compile for GemmaRMSNorm (#7642) 2024-08-22 01:14:13 -07:00
youkaichao
8c6f694a79 [ci] refine dependency for distributed tests (#7776) 2024-08-22 00:54:15 -07:00
Woosuk Kwon
eeee1c3b1a [TPU] Avoid initializing TPU runtime in is_tpu (#7763) 2024-08-21 21:31:49 -07:00
Michael Goin
aae74ef95c Revert "[Kernel] Expand MoE weight loading + Add Fused Marlin MoE Kernel (#7527)" (#7764) 2024-08-22 03:42:14 +00:00
Joe Runde
cde9183b40 [Bug][Frontend] Improve ZMQ client robustness (#7443)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-08-22 02:18:11 +00:00
zifeitong
df1a21131d [Model] Fix Phi-3.5-vision-instruct 'num_crops' issue (#7710) 2024-08-22 09:36:24 +08:00
Luka Govedič
7937009a7e [Kernel] Replaced blockReduce[...] functions with cub::BlockReduce (#7233)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-21 20:18:00 -04:00
Gregory Shtrasberg
9984605412 [AMD][CI/Build] Disambiguation of the function call for ROCm 6.2 headers compatibility (#7477)
Co-authored-by: Charlie Fu <Charlie.Fu@amd.com>
2024-08-21 16:47:36 -07:00
youkaichao
7eebe8ccaa [distributed][misc] error on same VLLM_HOST_IP setting (#7756) 2024-08-21 16:25:34 -07:00
Dipika Sikka
8678a69ab5 [Kernel] Expand MoE weight loading + Add Fused Marlin MoE Kernel (#7527)
Co-authored-by: ElizaWszola <eliza@neuralmagic.com>
2024-08-21 16:17:10 -07:00
William Lin
5844017285 [ci] [multi-step] narrow multi-step test dependency paths (#7760) 2024-08-21 15:52:40 -07:00
Peter Salas
1ca0d4f86b [Model] Add UltravoxModel and UltravoxConfig (#7615) 2024-08-21 22:49:39 +00:00
William Lin
dd53c4b023 [misc] Add Torch profiler support (#7451)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-08-21 15:39:26 -07:00
Robert Shaw
970dfdc01d [Frontend] Improve Startup Failure UX (#7716) 2024-08-21 19:53:01 +00:00
William Lin
91f4522cbf [multi-step] Raise error if not using async engine (#7703) 2024-08-21 11:49:19 -07:00
sasha0552
1b32e02648 [Bugfix] Pass PYTHONPATH from setup.py to CMake (#7730) 2024-08-21 11:17:48 -07:00
Robert Shaw
f7e3b0c5aa [Bugfix][Frontend] Fix Issues Under High Load With zeromq Frontend (#7394)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-08-21 13:34:14 -04:00
Brian Li
d3c002eadc [Bugfix] chat method add_generation_prompt param (#7734) 2024-08-21 17:33:35 +00:00
Nick Hill
9b73a2f498 [Spec Decoding] Use target model max length as default for draft model (#7706) 2024-08-22 00:23:22 +08:00
Isotr0py
6925cdbeea [Bugfix][Hardware][CPU] Fix mm_limits initialization for CPU backend (#7735) 2024-08-21 16:23:03 +00:00
LI MOU
53328d7536 [BUG] fix crash on flashinfer backend with cudagraph disabled, when attention group_size not in [1,2,4,8] (#7509) 2024-08-21 08:54:31 -07:00
Nick Hill
c75363fbc0 [BugFix] Avoid premature async generator exit and raise all exception variations (#7698) 2024-08-21 11:45:55 -04:00
sasha0552
dd3fa0e430 [Bugfix] Mirror jinja2 in pyproject.toml (#7723) 2024-08-21 13:41:17 +00:00
Cyrus Leung
baaedfdb2d [mypy] Enable following imports for entrypoints (#7248)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Fei <dfdfcai4@gmail.com>
2024-08-20 23:28:21 -07:00
Roger Wang
4506641212 [Doc] Section for Multimodal Language Models (#7719) 2024-08-20 23:24:01 -07:00
Isotr0py
12e1c65bc9 [Model] Add AWQ quantization support for InternVL2 model (#7187) 2024-08-20 23:18:57 -07:00
youkaichao
b74a125800 [ci] try to log process using the port to debug the port usage (#7711) 2024-08-20 17:41:12 -07:00
Antoni Baum
66a9e713a7 [Core] Pipe worker_class_fn argument in Executor (#7707) 2024-08-21 00:37:39 +00:00
youkaichao
9e51b6a626 [ci][test] adjust max wait time for cpu offloading test (#7709) 2024-08-20 17:12:44 -07:00
Kunshang Ji
6e4658c7aa [Intel GPU] fix xpu not support punica kernel (which use torch.library.custom_op) (#7685) 2024-08-20 12:01:09 -07:00
Antoni Baum
3b682179dd [Core] Add AttentionState abstraction (#7663) 2024-08-20 18:50:45 +00:00
Lucas Wilkinson
c6af027a35 [Misc] Add jinja2 as an explicit build requirement (#7695) 2024-08-20 17:17:47 +00:00
Ronen Schaffer
2aa00d59ad [CI/Build] Pin OpenTelemetry versions and make errors clearer (#7266)
[CI/Build] Pin OpenTelemetry versions and make availability errors clearer (#7266)
2024-08-20 10:02:21 -07:00
Kunshang Ji
c42590f97a [Hardware] [Intel GPU] refactor xpu worker/executor (#7686) 2024-08-20 09:54:10 -07:00
Isotr0py
aae6927be0 [VLM][Model] Add test for InternViT vision encoder (#7409) 2024-08-20 23:10:20 +08:00
Ilya Lavrenov
398521ad19 [OpenVINO] Updated documentation (#7687) 2024-08-20 07:33:56 -06:00
Lucas Wilkinson
5288c06aa0 [Kernel] (1/N) Machete - Hopper Optimized Mixed Precision Linear Kernel (#7174) 2024-08-20 07:09:33 -06:00
Kunshang Ji
b6f99a6ffe [Core] Refactor executor classes for easier inheritance (#7673)
[Core] Refactor executor classes to make it easier to inherit GPUExecutor (#7673)
2024-08-20 00:56:50 -07:00
youkaichao
ad28a74beb [misc][cuda] add warning for pynvml user (#7675) 2024-08-20 00:35:09 -07:00
jianyizh
e6d811dd13 [XPU] fallback to native implementation for xpu custom op (#7670) 2024-08-20 00:26:09 -07:00
youkaichao
c4be16e1a7 [misc] add nvidia related library in collect env (#7674) 2024-08-19 23:22:49 -07:00
Kuntai Du
3d8a5f063d [CI] Organizing performance benchmark files (#7616) 2024-08-19 22:43:54 -07:00
Zijian Hu
f4fc7337bf [Bugfix] support tie_word_embeddings for all models (#5724) 2024-08-19 20:00:04 -07:00
Kevin H. Luu
0df7ec0b2d [ci] Install Buildkite test suite analysis (#7667)
Signed-off-by: kevin <kevin@anyscale.com>
2024-08-19 19:55:04 -07:00
Abhinav Goyal
312f761232 [Speculative Decoding] Fixing hidden states handling in batch expansion (#7508) 2024-08-19 17:58:14 -07:00
youkaichao
e54ebc2f8f [doc] fix doc build error caused by msgspec (#7659) 2024-08-19 17:50:59 -07:00
Travis Johnson
67e02fa8a4 [Bugfix] use StoreBoolean instead of type=bool for --disable-logprobs-during-spec-decoding (#7665)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-08-20 00:43:09 +00:00
Woosuk Kwon
43735bf5e1 [TPU] Remove redundant input tensor cloning (#7660) 2024-08-19 15:55:04 -07:00
Andrew Song
da115230fd [Bugfix] Don't disable existing loggers (#7664) 2024-08-19 15:11:58 -07:00
Isotr0py
7601cb044d [Core] Support tensor parallelism for GGUF quantization (#7520) 2024-08-19 17:30:14 -04:00
William Lin
47b65a5508 [core] Multi Step Scheduling (#7000)
Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com>
2024-08-19 13:52:13 -07:00
Ali Panahi
dad961ef5c [Bugfix] fix lora_dtype value type in arg_utils.py - part 2 (#5428) 2024-08-19 20:47:00 +00:00
Cody Yu
3ac50b47d0 [MISC] Add prefix cache hit rate to metrics (#7606) 2024-08-19 11:52:07 -07:00
Woosuk Kwon
df845b2b46 [Misc] Remove Gemma RoPE (#7638) 2024-08-19 09:29:31 -07:00
Kunshang Ji
1a36287b89 [Bugfix] Fix xpu build (#7644) 2024-08-18 22:00:09 -07:00
Peng Guanwen
f710fb5265 [Core] Use flashinfer sampling kernel when available (#7137)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-19 03:24:03 +00:00
SangBin Cho
ff7ec82c4d [Core] Optimize SPMD architecture with delta + serialization optimization (#7109) 2024-08-18 17:57:20 -07:00
Woosuk Kwon
200a2ffa6b [Misc] Refactor Llama3 RoPE initialization (#7637) 2024-08-18 17:18:12 -07:00
Alex Brooks
40e1360bb6 [CI/Build] Add text-only test for Qwen models (#7475)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2024-08-19 07:43:46 +08:00
Robert Shaw
e3b318216d [ Bugfix ] Fix Prometheus Metrics With zeromq Frontend (#7279)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-08-18 20:19:48 +00:00
Woosuk Kwon
ab7165f2c7 [TPU] Optimize RoPE forward_native2 (#7636) 2024-08-18 01:15:10 -07:00
Woosuk Kwon
0c2fa50b84 [TPU] Use mark_dynamic only for dummy run (#7634) 2024-08-18 00:18:53 -07:00
Woosuk Kwon
ce143353c6 [TPU] Skip creating empty tensor (#7630) 2024-08-17 14:22:46 -07:00
Roger Wang
bbf55c4805 [VLM] Refactor MultiModalConfig initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00
Jee Jee Li
1ef13cf92f [Misc]Fix BitAndBytes exception messages (#7626) 2024-08-17 12:02:14 -07:00
youkaichao
832163b875 [ci][test] allow longer wait time for api server (#7629) 2024-08-17 11:26:38 -07:00
Besher Alkurdi
e73f76eec6 [Model] Pipeline parallel support for JAIS (#7603) 2024-08-17 11:11:09 -07:00
youkaichao
d95cc0a55c [core][misc] update libcudart finding (#7620)
Co-authored-by: cjackal <44624812+cjackal@users.noreply.github.com>
2024-08-16 23:01:35 -07:00
youkaichao
5bf45db7df [ci][test] fix engine/logger test (#7621) 2024-08-16 23:00:59 -07:00
youkaichao
eed020f673 [misc] use nvml to get consistent device name (#7582) 2024-08-16 21:15:13 -07:00
Xander Johnson
7c0b7ea214 [Bugfix] add >= 1.0 constraint for openai dependency (#7612) 2024-08-16 20:56:01 -07:00
SangBin Cho
4706eb628e [aDAG] Unflake aDAG + PP tests (#7600) 2024-08-16 20:49:30 -07:00
Rui Qiao
bae888cb8e [Bugfix] Clear engine reference in AsyncEngineRPCServer (#7618)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-16 20:44:05 -07:00
Alexei-V-Ivanov-AMD
6bd19551b0 [Build/CI] Enabling passing AMD tests. (#7610) 2024-08-16 20:25:32 -07:00
bnellnm
e680349994 [Bugfix] Fix custom_ar support check (#7617) 2024-08-16 19:05:49 -07:00
Michael Goin
44f26a9466 [Model] Align nemotron config with final HF state and fix lm-eval-small (#7611) 2024-08-16 15:56:34 -07:00
bnellnm
37fd47e780 [Kernel] fix types used in aqlm and ggml kernels to support dynamo (#7596) 2024-08-16 14:00:11 -07:00
bnellnm
7759ae958f [Kernel][Misc] dynamo support for ScalarType (#7594) 2024-08-16 13:59:49 -07:00
bnellnm
9f69856356 [Kernel] register punica functions as torch ops (#7591) 2024-08-16 13:59:38 -07:00
Michael Goin
d4f0f17b02 [Doc] Update quantization supported hardware table (#7595) 2024-08-16 13:59:27 -07:00
Michael Goin
b3f4e17935 [Doc] Add docs for llmcompressor INT8 and FP8 checkpoints (#7444) 2024-08-16 13:59:16 -07:00
Mahesh Keralapura
93478b63d2 [Core] Fix tracking of model forward time in case of PP>1 (#7440)
[Core] Fix tracking of model forward time to the span traces in case of PP>1 (#7440)
2024-08-16 13:46:01 -07:00
William Lin
f366f6339b [spec decode] [4/N] Move update_flash_attn_metadata to attn backend (#7571)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-08-16 11:41:56 -07:00
Michael Goin
855866caa9 [Kernel] Add tuned triton configs for ExpertsInt8 (#7601) 2024-08-16 11:37:01 -07:00
Mor Zusman
7fc23be81c [Kernel] W8A16 Int8 inside FusedMoE (#7415) 2024-08-16 10:06:51 -07:00
Charlie Fu
e837b624f2 [Feature][Hardware][Amd] Add fp8 Linear Layer for Rocm (#7210) 2024-08-16 10:06:30 -07:00
fzyzcjy
ec724a725e support tqdm in notebooks (#7510) 2024-08-16 09:17:50 -07:00
Gordon Wong
0e39a33c6d [Bugfix][Hardware][AMD][Frontend] add quantization param to embedding checking method (#7513) 2024-08-16 10:05:18 -06:00
Kuntai Du
6fc5b0f249 [CI] Fix crashes of performance benchmark (#7500) 2024-08-16 08:08:45 -07:00
Nick Hill
9587b050fb [Core] Use uvloop with zmq-decoupled front-end (#7570) 2024-08-15 22:48:07 -07:00
youkaichao
54bd9a03c4 register custom op for flash attn and use from torch.ops (#7536) 2024-08-15 22:38:56 -07:00
jon-chuang
50b8d08dbd [Misc/Testing] Use torch.testing.assert_close (#7324) 2024-08-16 04:24:04 +00:00
Michael Goin
e165528778 [CI] Move quantization cpu offload tests out of fastcheck (#7574) 2024-08-15 21:16:20 -07:00
nunjunj
3b19e39dc5 Chat method for offline llm (#5049)
Co-authored-by: nunjunj <ray@g-3ff9f30f2ed650001.c.vllm-405802.internal>
Co-authored-by: nunjunj <ray@g-1df6075697c3f0001.c.vllm-405802.internal>
Co-authored-by: nunjunj <ray@g-c5a2c23abc49e0001.c.vllm-405802.internal>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-08-15 19:41:34 -07:00
youkaichao
4cd7d47fed [ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00
Grant Pinkert
f878c8feb0 [Feature]: Add OpenAI server prompt_logprobs support #6508 (#7453) 2024-08-16 02:38:08 +00:00
shangmingc
b67ae00cdb [Misc] Add quantization config support for speculative model. (#7343) 2024-08-15 19:34:28 -07:00
Michael Goin
9c8e2d1161 [Bugfix][Harmless] Fix float16 dtype for model_is_embedding (#7566) 2024-08-15 18:26:19 -07:00
Michael Goin
21313e09e3 [Bugfix] Fix default weight loading for scalars (#7534) 2024-08-15 13:10:22 -07:00
PHILO-HE
f4da5f7b6d [Misc] Update dockerfile for CPU to cover protobuf installation (#7182) 2024-08-15 10:03:01 -07:00
omrishiv
9c1f78d5d6 [Bugfix] update neuron for version > 0.5.0 (#7175)
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-15 09:44:14 -07:00
Woosuk Kwon
fc93e56143 [Bugfix][TPU] Correct env variable for XLA cache path (#7544) 2024-08-15 00:02:29 -07:00
Kameshwara Pavan Kumar Mantha
22b39e11f2 llama_index serving integration documentation (#6973)
Co-authored-by: pavanmantha <pavan.mantha@thevaslabs.io>
2024-08-14 15:38:37 -07:00
Kyle Sayers
f55a9aea45 [Misc] Revert compressed-tensors code reuse (#7521) 2024-08-14 15:07:37 -07:00
Woosuk Kwon
951fdd66d3 [TPU] Set per-rank XLA cache (#7533) 2024-08-14 14:47:51 -07:00
William Lin
2ecf7b1757 [core] [3/N] multi-step args and sequence.py (#7452) 2024-08-14 12:32:45 -07:00
Cyrus Leung
3f674a49b5 [VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-14 17:55:42 +00:00
Wallas Henrique
70b746efcf [Misc] Deprecation Warning when setting --engine-use-ray (#7424)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: youkaichao <youkaichao@126.com>
2024-08-14 09:44:27 -07:00
jack
67d115db08 [Bugfix][Frontend] Disable embedding API for chat models (#7504)
Co-authored-by: jack <jack@alex>
2024-08-14 09:15:19 -07:00
youkaichao
d3d9cb6e4b [ci] fix model tests (#7507) 2024-08-14 01:01:43 -07:00
Chang Su
c134a46402 Fix empty output when temp is too low (#2937)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-08-14 05:31:44 +00:00
youkaichao
199adbb7cf [doc] update test script to include cudagraph (#7501) 2024-08-13 21:52:58 -07:00
Cyrus Leung
dd164d72f3 [Bugfix][Docs] Update list of mock imports (#7493) 2024-08-13 20:37:30 -07:00
youkaichao
ea49e6a3c8 [misc][ci] fix cpu test with plugins (#7489) 2024-08-13 19:27:46 -07:00
Jee Jee Li
97992802f3 [CI/Build]Reduce the time consumption for LoRA tests (#7396) 2024-08-13 17:27:29 -07:00
Woosuk Kwon
59edd0f134 [Bugfix][CI] Import ray under guard (#7486) 2024-08-13 17:12:58 -07:00
Woosuk Kwon
a08df8322e [TPU] Support multi-host inference (#7457) 2024-08-13 16:31:20 -07:00
youkaichao
16422ea76f [misc][plugin] add plugin system implementation (#7426) 2024-08-13 16:24:17 -07:00
Kyle Sayers
373538f973 [Misc] compressed-tensors code reuse (#7277) 2024-08-13 19:05:15 -04:00
youkaichao
33e5d7e6b6 [frontend] spawn engine process from api server process (#7484) 2024-08-13 15:40:17 -07:00
Simon Mo
c5c7768264 Announce NVIDIA Meetup (#7483)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-08-13 14:28:36 -07:00
Dipika Sikka
b1e5afc3e7 [Misc] Update awq and awq_marlin to use vLLMParameters (#7422) 2024-08-13 17:08:20 -04:00
Dipika Sikka
d3bdfd3ab9 [Misc] Update Fused MoE weight loading (#7334) 2024-08-13 14:57:45 -04:00
Dipika Sikka
fb377d7e74 [Misc] Update gptq_marlin to use new vLLMParameters (#7281) 2024-08-13 14:30:11 -04:00
Dipika Sikka
181abbc27d [Misc] Update LM Eval Tolerance (#7473) 2024-08-13 14:28:14 -04:00
Peter Salas
00c3d68e45 [Frontend][Core] Add plumbing to support audio language models (#7446) 2024-08-13 17:39:33 +00:00
Woosuk Kwon
e20233d361 Revert "[Doc] Update supported_hardware.rst (#7276)" (#7467) 2024-08-13 01:37:08 -07:00
Woosuk Kwon
d6e634f3d7 [TPU] Suppress import custom_ops warning (#7458) 2024-08-13 00:30:30 -07:00
youkaichao
4d2dc5072b [hardware] unify usage of is_tpu to current_platform.is_tpu() (#7102) 2024-08-13 00:16:42 -07:00
Cyrus Leung
7025b11d94 [Bugfix] Fix weight loading for Chameleon when TP>1 (#7410) 2024-08-13 05:33:41 +00:00
Kevin H. Luu
5469146bcc [ci] Remove fast check cancel workflow (#7455) 2024-08-12 21:19:51 -07:00
Andrew Wang
97a6be95ba [Misc] improve logits processors logging message (#7435) 2024-08-13 02:29:34 +00:00
Cyrus Leung
9ba85bc152 [mypy] Misc. typing improvements (#7417) 2024-08-13 09:20:20 +08:00
Rui Qiao
198d6a2898 [Core] Shut down aDAG workers with clean async llm engine exit (#7224)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-12 17:57:16 -07:00
Daniele
774cd1d3bf [CI/Build] bump minimum cmake version (#6999) 2024-08-12 16:29:20 -07:00
sasha0552
91294d56e1 [Bugfix] Handle PackageNotFoundError when checking for xpu version (#7398) 2024-08-12 16:07:20 -07:00
jon-chuang
a046f86397 [Core/Bugfix] Add FP8 K/V Scale and dtype conversion for prefix/prefill Triton Kernel (#7208)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-08-12 22:47:41 +00:00
Cyrus Leung
4ddc4743d7 [Core] Consolidate GB constant and enable float GB arguments (#7416) 2024-08-12 14:14:14 -07:00
Lucas Wilkinson
6aa33cb2dd [Misc] Use scalar type to dispatch to different gptq_marlin kernels (#7323) 2024-08-12 14:40:13 -04:00
Kevin H. Luu
1137f343aa [ci] Cancel fastcheck when PR is ready (#7433)
Signed-off-by: kevin <kevin@anyscale.com>
2024-08-12 10:59:14 -07:00
Kevin H. Luu
9b3e2edd30 [ci] Cancel fastcheck run when PR is marked ready (#7427)
Signed-off-by: kevin <kevin@anyscale.com>
2024-08-12 10:56:52 -07:00
Kevin H. Luu
65950e8f58 [ci] Entrypoints run upon changes in vllm/ (#7423)
Signed-off-by: kevin <kevin@anyscale.com>
2024-08-12 10:18:03 -07:00
Woosuk Kwon
cfba4def5d [Bugfix] Fix logit soft cap in flash-attn backend (#7425) 2024-08-12 09:58:28 -07:00
Daniele
d2bc4510a4 [CI/Build] bump Dockerfile.neuron image base, use public ECR (#6832) 2024-08-12 09:53:35 -07:00
Cyrus Leung
24154f8618 [Frontend] Disallow passing model as both argument and option (#7347) 2024-08-12 12:58:34 +00:00
Roger Wang
e6e42e4b17 [Core][VLM] Support image embeddings as input (#6613) 2024-08-12 16:16:06 +08:00
Lily Liu
ec2affa8ae [Kernel] Flashinfer correctness fix for v0.1.3 (#7319) 2024-08-12 07:59:17 +00:00
Roger Wang
86ab567bae [CI/Build] Minor refactoring for vLLM assets (#7407) 2024-08-12 02:41:52 +00:00
Simon Mo
f020a6297e [Docs] Update readme (#7316) 2024-08-11 17:13:37 -07:00
youkaichao
6c8e595710 [misc] add commit id in collect env (#7405) 2024-08-11 15:40:48 -07:00
tomeras91
02b1988b9f [Doc] building vLLM with VLLM_TARGET_DEVICE=empty (#7403) 2024-08-11 14:38:17 -07:00
tomeras91
386087970a [CI/Build] build on empty device for better dev experience (#4773) 2024-08-11 13:09:44 -07:00
William Lin
c08e2b3086 [core] [2/N] refactor worker_base input preparation for multi-step (#7387) 2024-08-11 08:50:08 -07:00
Noam Gat
4fb7b52a2c Updating LM Format Enforcer version to v0.10.6 (#7189) 2024-08-11 08:11:50 -04:00
Woosuk Kwon
90bab18f24 [TPU] Use mark_dynamic to reduce compilation time (#7340) 2024-08-10 18:12:22 -07:00
Isotr0py
4c5d8e8ea9 [Bugfix] Fix phi3v batch inference when images have different aspect ratio (#7392) 2024-08-10 16:19:33 +00:00
Cade Daniel
baa240252e [Core] Fix edge case in chunked prefill + block manager v2 (#7380) 2024-08-09 23:48:49 +00:00
Antoni Baum
999ef0b917 [Misc] Add numpy implementation of compute_slot_mapping (#7377) 2024-08-09 22:52:29 +00:00
Dipika Sikka
5c6c54d67a [Bugfix] Fix PerTensorScaleParameter weight loading for fused models (#7376) 2024-08-09 21:23:46 +00:00
Mahesh Keralapura
933790c209 [Core] Add span metrics for model_forward, scheduler and sampler time (#7089) 2024-08-09 13:55:13 -07:00
Roger Wang
70d268a399 [Bugfix] Fix ITL recording in serving benchmark (#7372) 2024-08-09 10:00:00 -07:00
Pooya Davoodi
249b88228d [Frontend] Support embeddings in the run_batch API (#7132)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-08-09 09:48:21 -07:00
Alexander Matveev
74af2bbd90 [Bugfix] Fix reinit procedure in ModelInputForGPUBuilder (#7360) 2024-08-09 16:35:49 +00:00
Alexander Matveev
fc7b8d1eef [Performance] e2e overheads reduction: Small followup diff (#7364) 2024-08-09 15:49:36 +00:00
Isotr0py
67abdbb42f [VLM][Doc] Add stop_token_ids to InternVL example (#7354) 2024-08-09 14:51:04 +00:00
Mor Zusman
07ab160741 [Model][Jamba] Mamba cache single buffer (#6739)
Co-authored-by: Mor Zusman <morz@ai21.com>
2024-08-09 10:07:06 -04:00
Nick Hill
b4e9528f95 [Core] Streamline stream termination in AsyncLLMEngine (#7336) 2024-08-09 07:06:36 +00:00
William Lin
57b7be0e1c [Speculative decoding] [Multi-Step] decouple should_modify_greedy_probs_inplace (#6971) 2024-08-09 05:42:45 +00:00
Travis Johnson
99b4cf5f23 [Bugfix] Fix speculative decoding with MLPSpeculator with padded vocabulary (#7218)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-08-08 22:08:46 -07:00
Alexander Matveev
e02ac55617 [Performance] Optimize e2e overheads: Reduce python allocations (#7162) 2024-08-08 21:34:28 -07:00
Woosuk Kwon
73388c07a4 [TPU] Fix dockerfile.tpu (#7331) 2024-08-08 20:24:58 -07:00
Cyrus Leung
7eb4a51c5f [Core] Support serving encoder/decoder models (#7258) 2024-08-09 10:39:41 +08:00
Siyuan Liu
0fa14907da [TPU] Add Load-time W8A16 quantization for TPU Backend (#7005) 2024-08-08 18:35:49 -07:00
Simon Mo
5923532e15 Add Skywork AI as Sponsor (#7314) 2024-08-08 13:59:57 -07:00
Jee Jee Li
a049b107e2 [Misc] Temporarily resolve the error of BitAndBytes (#7308) 2024-08-08 13:42:58 -07:00
Isotr0py
8334c39f37 [Bugfix] Fix new Llama3.1 GGUF model loading (#7269) 2024-08-08 13:42:44 -07:00
Daniele
e904576743 [CI/Build] Dockerfile.cpu improvements (#7298) 2024-08-08 15:24:52 -04:00
Michael Goin
e14fb22e59 [Doc] Put collect_env issue output in a <detail> block (#7310) 2024-08-08 11:22:49 -07:00
Zach Zheng
782e53ab59 [Bugfix][fast] Fix the get_num_blocks_touched logic (#6849) 2024-08-08 10:43:30 -07:00
Joe Runde
21b9c49aa3 [Frontend] Kill the server on engine death (#6594)
Signed-off-by: Joe Runde <joe@joerun.de>
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-08-08 09:47:48 -07:00
Luka Govedič
5fb4a3f678 [Bugfix][Kernel] Increased atol to fix failing tests (#7305) 2024-08-08 12:16:13 -04:00
Jee Jee Li
757ac70a64 [Model] Rename MiniCPMVQwen2 to MiniCPMV2.6 (#7273) 2024-08-08 14:02:41 +00:00
Murali Andoorveedu
6dffa4b0a6 [Bugfix] Fix LoRA with PP (#7292) 2024-08-08 00:02:27 -07:00
Cherilyn Buren
48abee9e54 [Frontend] remove max_num_batched_tokens limit for lora (#7288) 2024-08-08 06:17:29 +00:00
Rui Qiao
746709642c [Misc] Fix typos in scheduler.py (#7285)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-07 17:06:01 -07:00
Lily Liu
e53dfd3eaf [Kernel] Fix Flashinfer Correctness (#7284) 2024-08-07 16:26:52 -07:00
Michael Goin
6d94420246 [Doc] Update supported_hardware.rst (#7276) 2024-08-07 14:21:50 -07:00
Nick Hill
fc1493a01e [FrontEnd] Make merge_async_iterators is_cancelled arg optional (#7282) 2024-08-07 13:35:14 -07:00
Lucas Wilkinson
311f743831 [Bugfix] Fix gptq failure on T4s (#7264) 2024-08-07 20:05:37 +00:00
Kevin H. Luu
469b3bc538 [ci] Make building wheels per commit optional (#7278)
Signed-off-by: kevin <kevin@anyscale.com>
2024-08-07 11:34:25 -07:00
Michael Goin
5223199e03 [Bugfix][FP8] Fix dynamic FP8 Marlin quantization (#7219) 2024-08-07 11:23:12 -07:00
Maximilien de Bayser
fde47d3bc2 [BugFix] Fix frontend multiprocessing hang (#7217)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-08-07 18:09:36 +00:00
Stas Bekman
0e12cd67a8 [Doc] add online speculative decoding example (#7243) 2024-08-07 09:58:02 -07:00
Ilya Lavrenov
80cbe10c59 [OpenVINO] migrate to latest dependencies versions (#7251) 2024-08-07 09:49:10 -07:00
Isotr0py
b764547616 [Bugfix] Fix input processor for InternVL2 model (#7164)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-07 09:32:07 -07:00
Rafael Vasquez
ab0f5e2823 Fixes typo in function name (#7275)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
2024-08-07 09:29:27 -07:00
Robert Shaw
564985729a [ BugFix ] Move zmq frontend to IPC instead of TCP (#7222) 2024-08-07 16:24:56 +00:00
Dipika Sikka
0f7052bc7e [Misc] Refactor linear layer weight loading; introduce BasevLLMParameter and weight_loader_v2 (#5874) 2024-08-07 09:17:58 -07:00
youkaichao
639159b2a6 [distributed][misc] add specialized method for cuda platform (#7249) 2024-08-07 08:54:52 -07:00
Cyrus Leung
66d617e343 [Frontend] Gracefully handle missing chat template and fix CI failure (#7238)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-08-07 09:12:05 +00:00
Atilla Akkuş
7b261092de [BUGFIX]: top_k is expected to be an integer. (#7227) 2024-08-07 00:32:16 -07:00
Roger Wang
2385c8f374 [Doc] Mock new dependencies for documentation (#7245) 2024-08-07 06:43:03 +00:00
Nick Hill
9a3f49ae07 [BugFix] Overhaul async request cancellation (#7111) 2024-08-07 13:21:41 +08:00
Michael Goin
f9a5600649 [Bugfix] Fix GPTQ and GPTQ Marlin CPU Offloading (#7225) 2024-08-06 18:34:26 -07:00
afeldman-nm
fd95e026e0 [Core] Subclass ModelRunner to support cross-attention & encoder sequences (towards eventual encoder/decoder model support) (#4942)
Co-authored-by: Andrew Feldman <afeld2012@gmail.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-08-06 16:51:47 -04:00
xiaobochen123
660470e5a3 [Core] Optimize evictor-v2 performance (#7193) 2024-08-06 12:34:25 -07:00
Luka Govedič
8d59dbb000 [Kernel] Add per-tensor and per-token AZP epilogues (#5941)
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-08-06 18:17:08 +00:00
Lily Liu
5c60c8c423 [SpecDecode] [Minor] Fix spec decode sampler tests (#7183) 2024-08-06 10:40:32 -07:00
Katarzyna Papis
00afc78590 [Bugfix] add gguf dependency (#7198)
Co-authored-by: katarzyna.papis <kpapis@kpapis-u20.sclab.intel.com>
2024-08-06 10:08:35 -07:00
Robert Shaw
541c1852d3 [ BugFix ] Fix ZMQ when VLLM_PORT is set (#7205) 2024-08-06 09:26:26 -07:00
Dipika Sikka
a3bbbfa1d8 [BugFix] Fix DeepSeek remote code (#7178) 2024-08-06 08:16:53 -07:00
Cyrus Leung
1f26efbb3a [Model] Support SigLIP encoder and alternative decoders for LLaVA models (#7153)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-08-06 16:55:31 +08:00
Jee Jee Li
9118217f58 [LoRA] Relax LoRA condition (#7146) 2024-08-06 01:57:25 +00:00
Simon Mo
e3c664bfcb [Build] Add initial conditional testing spec (#6841) 2024-08-05 17:39:22 -07:00
Isotr0py
360bd67cf0 [Core] Support loading GGUF model (#5191)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-05 17:54:23 -06:00
Cody Yu
ef527be06c [MISC] Use non-blocking transfer in prepare_input (#7172) 2024-08-05 23:41:27 +00:00
Jacob Schein
89b8db6bb2 [Bugfix] Specify device when loading LoRA and embedding tensors (#7129)
Co-authored-by: Jacob Schein <jacobschein@Jacobs-MacBook-Pro-2.local>
2024-08-05 16:35:47 -07:00
Thomas Parnell
789937af2e [Doc] [SpecDecode] Update MLPSpeculator documentation (#7100)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-08-05 23:29:43 +00:00
youkaichao
dfb1a15dcb [ci][frontend] deduplicate tests (#7101) 2024-08-05 15:59:22 -07:00
Simon Mo
4db5176d97 bump version to v0.5.4 (#7139)
2024-08-05 14:39:48 -07:00
Tyler Michael Smith
4cf1dc39be [Bugfix][CI/Build] Fix CUTLASS FetchContent (#7171) 2024-08-05 14:22:57 -07:00
Tyler Michael Smith
6e4852ce28 [CI/Build] Suppress divide-by-zero and missing return statement warnings (#7001) 2024-08-05 16:00:01 -04:00
Tyler Michael Smith
8571ac4672 [Kernel] Update CUTLASS to 3.5.1 (#7085) 2024-08-05 15:13:43 -04:00
Rui Qiao
997cf78308 [Misc] Fix typo in GroupCoordinator.recv() (#7167)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-05 11:10:16 -07:00
Aditya Paliwal
57f560aa23 [BugFix] Use args.trust_remote_code (#7121) 2024-08-05 09:26:14 -07:00
Nick Hill
003f8ee128 [BugFix] Use IP4 localhost form for zmq bind (#7163) 2024-08-05 08:41:03 -07:00
Bongwon Jang
e9630458c7 [SpecDecode] Support FlashInfer in DraftModelRunner (#6926) 2024-08-05 08:05:05 -07:00
Cade Daniel
82a1b1a82b [Speculative decoding] Add periodic log with time spent in proposal/scoring/verification (#6963) 2024-08-05 08:46:44 +00:00
Jungho Christopher Cho
c0d8f1636c [Model] SiglipVisionModel ported from transformers (#6942)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-08-05 06:22:12 +00:00
Cyrus Leung
cc08fc7225 [Frontend] Reapply "Factor out code for running uvicorn" (#7095) 2024-08-04 20:40:51 -07:00
Alphi
7b86e7c9cd [Model] Add multi-image support for minicpmv (#7122)
Co-authored-by: hezhihui <hzh7269@modelbest.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-05 09:23:17 +08:00
Jee Jee Li
f80ab3521c Clean up remaining Punica C information (#7027) 2024-08-04 15:37:08 -07:00
youkaichao
16a1cc9bb2 [misc][distributed] improve libcudart.so finding (#7127) 2024-08-04 11:31:51 -07:00
Thomas Parnell
b1c9aa3daa [Bugfix] [SpecDecode] Default speculative_draft_tensor_parallel_size to 1 when using MLPSpeculator (#7105)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-08-04 07:13:18 -07:00
Jee Jee Li
179a6a36f2 [Model]Refactor MiniCPMV (#7020)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-04 08:12:41 +00:00
youkaichao
83c644fe7e [core][misc] simply output processing with shortcut code path (#7117) 2024-08-04 00:22:19 -07:00
youkaichao
9fadc7b7a0 [misc] add zmq in collect env (#7119) 2024-08-03 22:03:46 -07:00
Yihuan Bu
654bc5ca49 Support for guided decoding for offline LLM (#6878)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-04 03:12:09 +00:00
Jeff Fialho
825b044863 [Frontend] Warn if user max_model_len is greater than derived max_model_len (#7080)
Signed-off-by: Jefferson Fialho <jfialho@ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-08-03 16:01:38 -07:00
youkaichao
44dcb52e39 [ci][test] finalize fork_new_process_for_each_test (#7114) 2024-08-03 10:44:53 -07:00
Kuntai Du
67d745cc68 [CI] Temporarily turn off H100 performance benchmark (#7104) 2024-08-02 23:52:44 -07:00
Jee Jee Li
99d7cabd7b [LoRA] ReplicatedLinear support LoRA (#7081) 2024-08-02 22:40:19 -07:00
Zach Zheng
fb2c1c86c1 [Bugfix] Fix block table for seqs that have prefix cache hits (#7018) 2024-08-02 22:38:15 -07:00
Isotr0py
0c25435daa [Model] Refactor and decouple weight loading logic for InternVL2 model (#7067) 2024-08-02 22:36:14 -07:00
youkaichao
a0d164567c [ci][distributed] disable ray dag tests (#7099) 2024-08-02 22:32:04 -07:00
youkaichao
04e5583425 [ci][distributed] merge distributed test commands (#7097)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-02 21:33:53 -07:00
Cyrus Leung
8c025fa703 [Frontend] Factor out chat message parsing (#7055) 2024-08-02 21:31:27 -07:00
youkaichao
69ea15e5cc [ci][distributed] shorten wait time if server hangs (#7098) 2024-08-02 21:05:16 -07:00
Robert Shaw
ed812a73fa [ Frontend ] Multiprocessing for OpenAI Server with zeromq (#6883)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Joe Runde <joe@joerun.de>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-08-02 18:27:28 -07:00
youkaichao
708989341e [misc] add a flag to enable compile (#7092) 2024-08-02 16:18:45 -07:00
Rui Qiao
22e718ff1a [Misc] Revive to use loopback address for driver IP (#7091)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-02 15:50:00 -07:00
Rui Qiao
05308891e2 [Core] Pipeline parallel with Ray ADAG (#6837)
Support pipeline-parallelism with Ray accelerated DAG.

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-02 13:55:40 -07:00
Lucas Wilkinson
a8d604ca2a [Misc] Disambiguate quantized types via a new ScalarType (#6396) 2024-08-02 13:51:58 -07:00
Michael Goin
b482b9a5b1 [CI/Build] Add support for Python 3.12 (#7035) 2024-08-02 13:51:22 -07:00
youkaichao
806949514a [ci] set timeout for test_oot_registration.py (#7082) 2024-08-02 10:03:24 -07:00
Jie Fu (傅杰)
c16eaac500 [Hardware][Intel CPU] Update torch 2.4.0 for CPU backend (#6931) 2024-08-02 08:55:58 -07:00
Peng Guanwen
db35186391 [Core] Comment out unused code in sampler (#7023) 2024-08-02 00:58:26 -07:00
youkaichao
660dea1235 [cuda][misc] remove error_on_invalid_device_count_status (#7069) 2024-08-02 00:14:21 -07:00
Bongwon Jang
cf2a1a4d9d Fix tracing.py (#7065) 2024-08-01 23:28:00 -07:00
youkaichao
252357793d [ci][distributed] try to fix pp test (#7054) 2024-08-01 22:03:12 -07:00
Cyrus Leung
3bb4b1e4cd [mypy] Speed up mypy checking (#7056) 2024-08-01 19:49:43 -07:00
Lily Liu
954f7305a1 [Kernel] Fix input for flashinfer prefill wrapper. (#7008) 2024-08-01 18:44:16 -07:00
Woosuk Kwon
6ce01f3066 [Performance] Optimize get_seqs (#7051) 2024-08-01 18:29:52 -07:00
Tyler Michael Smith
6a11fdfbb8 [CI/Build][Bugfix] Fix CUTLASS header-only line (#7034) 2024-08-01 13:51:15 -07:00
Woosuk Kwon
805a8a75f2 [Misc] Support attention logits soft-capping with flash-attn (#7022) 2024-08-01 13:14:37 -07:00
omkar kakarparthi
562e580abc Update run-amd-test.sh (#7044) 2024-08-01 13:12:37 -07:00
Murali Andoorveedu
fc912e0886 [Models] Support Qwen model with PP (#6974)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-08-01 12:40:43 -07:00
Michael Goin
f4fd390f5d [Bugfix] Lower gemma's unloaded_params exception to warning (#7002) 2024-08-01 12:01:07 -07:00
Michael Goin
fb3db61688 [CI/Build] Remove sparseml requirement from testing (#7037) 2024-08-01 12:00:51 -07:00
Isotr0py
2dd34371a6 [Bugfix] Fix RMSNorm forward in InternViT attention qk_layernorm (#6992) 2024-08-01 12:00:28 -07:00
Sage Moore
7e0861bd0b [CI/Build] Update PyTorch to 2.4.0 (#6951)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-01 11:11:24 -07:00
Alexei-V-Ivanov-AMD
a72a424b3e [Build/CI] Fixing Docker Hub quota issue. (#7043) 2024-08-01 11:07:37 -07:00
youkaichao
c8a7e93273 [core][scheduler] simplify and improve scheduler (#6867) 2024-07-31 23:51:09 -07:00
zifeitong
3c10591ef2 [Bugfix] Set SamplingParams.max_tokens for OpenAI requests if not provided by user (#6954) 2024-07-31 21:13:34 -07:00
Aurick Qiao
0437492ea9 PP comm optimization: replace send with partial send + allgather (#6695)
Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>
2024-07-31 20:15:42 -07:00
Travis Johnson
630dd9e0ae [Bugfix][Model] Skip loading lm_head weights if using tie_word_embeddings (#6758)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-31 19:49:11 -07:00
Woosuk Kwon
23993a7997 [Bugfix][TPU] Do not use torch.Generator for TPUs (#6981) 2024-07-31 18:50:28 -07:00
xuyi
1d2e7fb73f [Model] Pipeline parallel support for Qwen2 (#6924) 2024-07-31 18:49:51 -07:00
Jee Jee Li
7ecee34321 [Kernel][RFC] Refactor the punica kernel based on Triton (#5036) 2024-07-31 17:12:24 -07:00
Simon Mo
7eb0cb4a14 Revert "[Frontend] Factor out code for running uvicorn" (#7012)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-07-31 16:34:26 -07:00
Michael Goin
a0dce9383a [Misc] Add compressed-tensors to optimized quant list (#7006) 2024-07-31 14:40:44 -07:00
Varun Sundar Rabindranath
35e9c12bfa [Kernel] Tuned int8 Cutlass Kernels for SM75 (T4) (#6996)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-31 14:40:32 -07:00
Varun Sundar Rabindranath
93548eb37e [Kernel] Enable FP8 Cutlass for Ada Lovelace (#6950)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-31 14:40:22 -07:00
Michael Goin
460c1884e3 [Bugfix] Support cpu offloading with fp8 quantization (#6960) 2024-07-31 12:47:46 -07:00
Cody Yu
bd70013407 [MISC] Introduce pipeline parallelism partition strategies (#6920)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-07-31 12:02:17 -07:00
Avshalom Manevich
2ee8d3ba55 [Model] use FusedMoE layer in Jamba (#6935) 2024-07-31 12:00:24 -07:00
Cyrus Leung
daed30c4a9 [Bugfix] Fix feature size calculation for LLaVA-NeXT (#6982) 2024-07-31 23:46:17 +08:00
Alphi
2f4e108f75 [Bugfix] Clean up MiniCPM-V (#6939)
Co-authored-by: hezhihui <hzh7269@modelbest.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-31 14:39:19 +00:00
HandH1998
6512937de1 Support W4A8 quantization for vllm (#5218) 2024-07-31 07:55:21 -06:00
Fei
c0644cf9ce [Bugfix] fix logit processor exceed vocab size issue (#6927) 2024-07-31 16:16:01 +08:00
Woosuk Kwon
533d1932d2 [Bugfix][TPU] Set readonly=True for non-root devices (#6980) 2024-07-31 00:19:28 -07:00
Cyrus Leung
9f0e69b653 [CI/Build] Fix mypy errors (#6968) 2024-07-30 19:49:48 -07:00
Cyrus Leung
f230cc2ca6 [Bugfix] Fix broadcasting logic for multi_modal_kwargs (#6836) 2024-07-31 10:38:45 +08:00
Cyrus Leung
da1f7cc12a [mypy] Enable following imports for some directories (#6681) 2024-07-31 10:38:03 +08:00
Cade Daniel
c32ab8be1a [Speculative decoding] Add serving benchmark for llama3 70b + speculative decoding (#6964) 2024-07-31 00:53:21 +00:00
Cade Daniel
fb4f530bf5 [CI] [nightly benchmark] Do not re-download sharegpt dataset if exists (#6706) 2024-07-30 16:28:49 -07:00
Cade Daniel
79319cedfa [Nightly benchmarking suite] Remove pkill python from run benchmark suite (#6965) 2024-07-30 16:28:05 -07:00
Simon Mo
40c27a7cbb [Build] Temporarily Disable Kernels and LoRA tests (#6961) 2024-07-30 14:59:48 -07:00
youkaichao
6ca8031e71 [core][misc] improve free_finished_seq_groups (#6865)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-07-30 14:32:12 -07:00
Tyler Michael Smith
d7a299edaa [Kernel] Remove scaled_fp8_quant kernel padding footgun (#6842) 2024-07-30 16:37:01 -04:00
Sanger Steel
052b6f8ca4 [Bugfix] Fix tensorizer memory profiling bug during testing (#6881) 2024-07-30 11:48:50 -07:00
Ilya Lavrenov
5895b24677 [OpenVINO] Updated OpenVINO requirements and build docs (#6948) 2024-07-30 11:33:01 -07:00
Tyler Michael Smith
cbbc904470 [Kernel] Squash a few more warnings (#6914) 2024-07-30 13:50:42 -04:00
Nick Hill
5cf9254a9c [BugFix] Fix use of per-request seed with pipeline parallel (#6698) 2024-07-30 10:40:08 -07:00
fzyzcjy
f058403683 [Doc] Super tiny fix doc typo (#6949) 2024-07-30 09:14:03 -07:00
Roger Wang
c66c7f86ac [Bugfix] Fix PaliGemma MMP (#6930) 2024-07-30 02:20:57 -07:00
Woosuk Kwon
6e063ea35b [TPU] Fix greedy decoding (#6933) 2024-07-30 02:06:29 -07:00
Varun Sundar Rabindranath
af647fb8b3 [Kernel] Tuned int8 kernels for Ada Lovelace (#6848)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-29 20:24:58 -06:00
Tyler Michael Smith
61a97c32f6 [Kernel] Fix marlin divide-by-zero warnings (#6904) 2024-07-30 01:26:07 +00:00
Kevin H. Luu
4fbf4aa128 [ci] GHA workflow to remove ready label upon "/notready" comment (#6921)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-29 17:03:45 -07:00
Tyler Michael Smith
aae6d36f7e [Kernel] Remove unused variables in awq/gemm_kernels.cu (#6908) 2024-07-29 18:01:17 -06:00
Nick Hill
9f69d8245a [Frontend] New allowed_token_ids decoding request parameter (#6753) 2024-07-29 23:37:27 +00:00
Thomas Parnell
9a7e2d0534 [Bugfix] Allow vllm to still work if triton is not installed. (#6786)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-29 14:51:27 -07:00
Earthwalker
7f8d612d24 [TPU] Support tensor parallelism in async llm engine (#6891) 2024-07-29 12:42:21 -07:00
Tyler Michael Smith
60d1c6e584 [Kernel] Fix deprecation function warnings squeezellm quant_cuda_kernel (#6901) 2024-07-29 09:59:02 -07:00
Peng Guanwen
db9e5708a9 [Core] Reduce unnecessary compute when logprobs=None (#6532) 2024-07-29 16:47:31 +00:00
Varun Sundar Rabindranath
766435e660 [Kernel] Tuned FP8 Kernels for Ada Lovelace (#6677)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-29 09:42:35 -06:00
Isotr0py
7cbd9ec7a9 [Model] Initialize support for InternVL2 series models (#6514)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-29 10:16:30 +00:00
Elsa Granger
3eeb148f46 [Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (#6871) 2024-07-28 11:13:49 -04:00
Michael Goin
b1366a9534 Add Nemotron to PP_SUPPORTED_MODELS (#6863) 2024-07-27 15:05:17 -07:00
Alexander Matveev
75acdaa4b6 [Kernel] Increase precision of GPTQ/AWQ Marlin kernel (#6795) 2024-07-27 17:52:33 -04:00
Woosuk Kwon
fad5576c58 [TPU] Reduce compilation time & Upgrade PyTorch XLA version (#6856) 2024-07-27 10:28:33 -07:00
Chenggang Wu
f954d0715c [Docs] Add RunLLM chat widget (#6857) 2024-07-27 09:24:46 -07:00
Cyrus Leung
1ad86acf17 [Model] Initial support for BLIP-2 (#5920)
Co-authored-by: ywang96 <ywang@roblox.com>
2024-07-27 11:53:07 +00:00
Roger Wang
ecb33a28cb [CI/Build][Doc] Update CI and Doc for VLM example changes (#6860) 2024-07-27 09:54:14 +00:00
Wang Ran (汪然)
a57d75821c [bugfix] make args.stream work (#6831) 2024-07-27 09:07:02 +00:00
Roger Wang
925de97e05 [Bugfix] Fix VLM example typo (#6859) 2024-07-27 14:24:08 +08:00
Roger Wang
aa46953a20 [Misc][VLM][Doc] Consolidate offline examples for vision language models (#6858)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-07-26 22:44:13 -07:00
Travis Johnson
593e79e733 [Bugfix] torch.set_num_threads() in multiproc_gpu_executor (#6802)
[Bugfix] Use torch.set_num_threads() to configure parallelism in multiproc_gpu_executor (#6802)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-26 22:15:20 -07:00
Harry Mellor
c53041ae3b [Doc] Add missing mock import to docs conf.py (#6834) 2024-07-27 04:47:33 +00:00
Woosuk Kwon
52f07e3dec [Hardware][TPU] Implement tensor parallelism with Ray (#5871) 2024-07-26 20:54:27 -07:00
Joe
14dbd5a767 [Model] H2O Danube3-4b (#6451) 2024-07-26 20:47:50 -07:00
tomeras91
ed94e4f427 [Bugfix][Model] Jamba assertions and no chunked prefill by default for Jamba (#6784) 2024-07-26 20:45:31 -07:00
omrishiv
3c3012398e [Doc] add VLLM_TARGET_DEVICE=neuron to documentation for neuron (#6844)
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
2024-07-26 20:20:16 -07:00
Woosuk Kwon
ced36cd89b [ROCm] Upgrade PyTorch nightly version (#6845) 2024-07-26 20:16:13 -07:00
Sanger Steel
969d032265 [Bugfix]: Fix Tensorizer test failures (#6835) 2024-07-26 20:02:25 -07:00
Lucas Wilkinson
55712941e5 [Bug Fix] Illegal memory access, FP8 Llama 3.1 405b (#6852) 2024-07-27 02:27:44 +00:00
Cyrus Leung
981b0d5673 [Frontend] Factor out code for running uvicorn (#6828) 2024-07-27 09:58:25 +08:00
Woosuk Kwon
d09b94ca58 [TPU] Support collective communications in XLA devices (#6813) 2024-07-27 01:45:57 +00:00
chenqianfzh
bb5494676f enforce eager mode with bnb quantization temporarily (#6846) 2024-07-27 01:32:20 +00:00
Gurpreet Singh Dhami
b5f49ee55b Update README.md (#6847) 2024-07-27 00:26:45 +00:00
Zhanghao Wu
150a1ffbfd [Doc] Update SkyPilot doc for wrong indents and instructions for update service (#4283) 2024-07-26 14:39:10 -07:00
Michael Goin
281977bd6e [Doc] Add Nemotron to supported model docs (#6843) 2024-07-26 17:32:44 -04:00
Li, Jiang
3bbb4936dc [Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-26 13:50:10 -07:00
Woosuk Kwon
aa4867791e [Misc][TPU] Support TPU in initialize_ray_cluster (#6812) 2024-07-26 19:39:49 +00:00
Woosuk Kwon
71734f1bf2 [Build/CI][ROCm] Minor simplification to Dockerfile.rocm (#6811) 2024-07-26 12:28:32 -07:00
Tyler Michael Smith
50704f52c4 [Bugfix][Kernel] Promote another index to int64_t (#6838) 2024-07-26 18:41:04 +00:00
Michael Goin
07278c37dd [Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) (#6611) 2024-07-26 14:33:42 -04:00
youkaichao
85ad7e2d01 [doc][debugging] add known issues for hangs (#6816) 2024-07-25 21:48:05 -07:00
Peng Guanwen
89a84b0bb7 [Core] Use array to speedup padding (#6779) 2024-07-25 21:31:31 -07:00
Anthony Platanios
084a01fd35 [Bugfix] [Easy] Fixed a bug in the multiprocessing GPU executor. (#6770) 2024-07-25 21:25:35 -07:00
QQSong
062a1d0fab Fix ReplicatedLinear weight loading (#6793) 2024-07-25 19:24:58 -07:00
Kevin H. Luu
2eb9f4ff26 [ci] Mark tensorizer as soft fail and separate from grouped test (#6810)
[ci] Mark tensorizer test as soft fail and separate it from grouped test in fast check (#6810)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-25 18:08:33 -07:00
youkaichao
443c7cf4cf [ci][distributed] fix flaky tests (#6806) 2024-07-25 17:44:09 -07:00
SangBin Cho
1adddb14bf [Core] Fix ray forward_dag error mssg (#6792) 2024-07-25 16:53:25 -07:00
Woosuk Kwon
b7215de2c5 [Docs] Publish 5th meetup slides (#6799) 2024-07-25 16:47:55 -07:00
youkaichao
f3ff63c3f4 [doc][distributed] improve multinode serving doc (#6804) 2024-07-25 15:38:32 -07:00
Lucas Wilkinson
cd7edc4e87 [Bugfix] Fix empty (nullptr) channelwise scales when loading wNa16 using compressed tensors (#6798) 2024-07-25 15:05:09 -07:00
Kuntai Du
6a1e25b151 [Doc] Add documentations for nightly benchmarks (#6412) 2024-07-25 11:57:16 -07:00
Tyler Michael Smith
95db75de64 [Bugfix] Add synchronize to prevent possible data race (#6788)
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2024-07-25 10:40:01 -07:00
Michael Goin
65b1f121c8 [Bugfix] Fix kv_cache_dtype=fp8 without scales for FP8 checkpoints (#6761) 2024-07-25 09:46:15 -07:00
Robert Shaw
889da130e7 [ Misc ] fp8-marlin channelwise via compressed-tensors (#6524)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-07-25 09:46:04 -07:00
Alphi
b75e314fff [Bugfix] Add image placeholder for OpenAI Compatible Server of MiniCPM-V (#6787)
Co-authored-by: hezhihui <hzh7269@modelbest.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-25 09:42:49 -07:00
Chang Su
316a41ac1d [Bugfix] Fix encoding_format in examples/openai_embedding_client.py (#6755) 2024-07-24 22:48:07 -07:00
Alexander Matveev
0310029a2f [Bugfix] Fix awq_marlin and gptq_marlin flags (#6745) 2024-07-24 22:34:11 -07:00
Cody Yu
309aaef825 [Bugfix] Fix decode tokens w. CUDA graph (#6757) 2024-07-24 22:33:56 -07:00
Alphi
9e169a4c61 [Model] Adding support for MiniCPM-V (#4087) 2024-07-24 20:59:30 -07:00
Evan Z. Liu
5689e256ba [Frontend] Represent tokens with identifiable strings (#6626) 2024-07-25 09:51:00 +08:00
youkaichao
740374d456 [core][distributed] fix zmq hang (#6759) 2024-07-24 17:37:12 -07:00
Hongxia Yang
d88c458f44 [Doc][AMD][ROCm]Added tips to refer to mi300x tuning guide for mi300x users (#6754) 2024-07-24 14:32:57 -07:00
Michael Goin
421e218b37 [Bugfix] Bump transformers to 4.43.2 (#6752) 2024-07-24 13:22:16 -07:00
Antoni Baum
5448f67635 [Core] Tweaks to model runner/input builder developer APIs (#6712) 2024-07-24 12:17:12 -07:00
Antoni Baum
0e63494cf3 Add fp8 support to reshape_and_cache_flash (#6667) 2024-07-24 18:36:52 +00:00
Daniele
ee812580f7 [Frontend] split run_server into build_server and run_server (#6740) 2024-07-24 10:36:04 -07:00
Allen.Dou
40468b13fa [Bugfix] Miscalculated latency led to inaccurate time_to_first_token_seconds. (#6686) 2024-07-24 08:58:42 -07:00
Nick Hill
2cf0df3381 [Bugfix] Fix speculative decode seeded test (#6743) 2024-07-24 08:58:31 -07:00
LF Marques
545146349c Adding f-string to validation error which is missing (#6748) 2024-07-24 08:55:53 -07:00
liuyhwangyh
f4f8a9d892 [Bugfix]fix modelscope compatible issue (#6730) 2024-07-24 05:04:46 -07:00
Alexei-V-Ivanov-AMD
b570811706 [Build/CI] Update run-amd-test.sh. Enable Docker Hub login. (#6711) 2024-07-24 05:01:14 -07:00
Woosuk Kwon
ccc4a73257 [Docs][ROCm] Detailed instructions to build from source (#6680) 2024-07-24 01:07:23 -07:00
Roger Wang
0a740a11ba [Bugfix] Fix token padding for chameleon (#6724) 2024-07-24 01:05:09 -07:00
Nick Hill
c882a7f5b3 [SpecDecoding] Update MLPSpeculator CI tests to use smaller model (#6714) 2024-07-24 07:34:22 +00:00
William Lin
5e8ca973eb [Bugfix] fix flashinfer cudagraph capture for PP (#6708) 2024-07-24 01:49:44 +00:00
dongmao zhang
87525fab92 [bitsandbytes]: support read bnb pre-quantized model (#5753)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-23 23:45:09 +00:00
Thomas Parnell
2f808e69ab [Bugfix] StatLoggers: cache spec decode metrics when they get collected. (#6645)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-23 23:05:05 +00:00
Michael Goin
01c16ede6b [CI] Add smoke test for non-uniform AutoFP8 quantization (#6702) 2024-07-23 22:45:12 +00:00
youkaichao
72fc704803 [build] relax wheel size limit (#6704) 2024-07-23 14:03:49 -07:00
Roger Wang
1bedf210e3 Bump transformers version for Llama 3.1 hotfix and patch Chameleon (#6690) 2024-07-23 13:47:48 -07:00
Travis Johnson
507ef787d8 [Model] Pipeline Parallel Support for DeepSeek v2 (#6519)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-23 12:22:09 -07:00
Yehoshua Cohen
58f53034ad [Frontend] Add Usage data in each chunk for chat_serving. #6540 (#6652) 2024-07-23 11:41:55 -07:00
Michael Goin
0eb0757bef [Misc] Add ignored layers for fp8 quantization (#6657) 2024-07-23 14:04:04 -04:00
Simon Mo
38c4b7e863 Bump version to 0.5.3.post1 (#6696)
2024-07-23 10:08:59 -07:00
Woosuk Kwon
a112a84aad [BugFix] Fix RoPE error in Llama 3.1 (#6693) 2024-07-23 09:46:05 -07:00
Woosuk Kwon
461089a21a [Bugfix] Fix a log error in chunked prefill (#6694) 2024-07-23 09:27:58 -07:00
youkaichao
71950af726 [doc][distributed] fix doc argument order (#6691) 2024-07-23 08:55:33 -07:00
Woosuk Kwon
cb1362a889 [Docs] Announce llama3.1 support (#6688) 2024-07-23 08:18:15 -07:00
Simon Mo
bb2fc08072 Bump version to v0.5.3 (#6674)
2024-07-23 00:00:08 -07:00
Simon Mo
3eda4ec780 support ignore patterns in model loader (#6673) 2024-07-22 23:59:42 -07:00
Roger Wang
22fa2e35cb [VLM][Model] Support image input for Chameleon (#6633) 2024-07-22 23:50:48 -07:00
youkaichao
c5201240a4 [misc] only tqdm for first rank (#6672) 2024-07-22 21:57:27 -07:00
Cyrus Leung
97234be0ec [Misc] Manage HTTP connections in one place (#6600) 2024-07-22 21:32:02 -07:00
youkaichao
c051bfe4eb [doc][distributed] doc for setting up multi-node environment (#6529)
[doc][distributed] add more doc for setting up multi-node environment (#6529)
2024-07-22 21:22:09 -07:00
Michael Goin
9e0b558a09 [Misc] Support FP8 kv cache scales from compressed-tensors (#6528) 2024-07-23 04:11:50 +00:00
zhaotyer
e519ae097a add tqdm when loading checkpoint shards (#6569)
Co-authored-by: tianyi.zhao <tianyi.zhao@transwarp.io>
Co-authored-by: youkaichao <youkaichao@126.com>
2024-07-22 20:48:01 -07:00
youkaichao
7c2749a4fd [misc] add start loading models for users information (#6670) 2024-07-22 20:08:02 -07:00
Woosuk Kwon
729171ae58 [Misc] Enable chunked prefill by default for long context models (#6666) 2024-07-22 20:03:13 -07:00
Cheng Li
c5e8330997 [Bugfix] Fix null modules_to_not_convert in FBGEMM Fp8 quantization (#6665) 2024-07-22 19:25:05 -07:00
Cody Yu
e0c15758b8 [Core] Modulize prepare input and attention metadata builder (#6596) 2024-07-23 00:45:24 +00:00
Woosuk Kwon
bdf5fd1386 [Misc] Remove deprecation warning for beam search (#6659) 2024-07-23 00:21:58 +00:00
youkaichao
5a96ee52a3 [ci][build] add back vim in docker (#6661) 2024-07-22 16:26:29 -07:00
Jiaxin Shan
42c7f66a38 [Core] Support dynamically loading Lora adapter from HuggingFace (#6234)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-07-22 15:42:40 -07:00
Kevin H. Luu
69d5ae38dc [ci] Use different sccache bucket for CUDA 11.8 wheel build (#6656)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-22 14:20:41 -07:00
Tyler Michael Smith
fea59c7712 [Bugfix][Kernel] Use int64_t for indices in fp8 quant kernels (#6649) 2024-07-22 14:08:30 -06:00
Cyrus Leung
739b61a348 [Frontend] Refactor prompt processing (#4028)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-22 10:13:53 -07:00
Jae-Won Chung
89c1c6a196 [Bugfix] Fix vocab_size field access in llava_next.py (#6624) 2024-07-22 05:02:51 +00:00
Woosuk Kwon
42de2cefcb [Misc] Add a wrapper for torch.inference_mode (#6618) 2024-07-21 18:43:11 -07:00
Roger Wang
c9eef37f32 [Model] Initial Support for Chameleon (#5770) 2024-07-21 17:37:51 -07:00
Alexander Matveev
396d92d5e0 [Kernel][Core] Add AWQ support to the Marlin kernel (#6612) 2024-07-21 19:41:42 -04:00
Isotr0py
25e778aa16 [Model] Refactor and decouple phi3v image embedding (#6621) 2024-07-21 16:07:58 -07:00
Woosuk Kwon
b6df37f943 [Misc] Remove abused noqa (#6619) 2024-07-21 23:47:04 +08:00
sroy745
14f91fe67c [Spec Decode] Disable Log Prob serialization to CPU for spec decoding for both draft and target models. (#6485) 2024-07-20 23:58:58 -07:00
Cyrus Leung
d7f4178dd9 [Frontend] Move chat utils (#6602)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-21 08:38:17 +08:00
Robert Shaw
082ecd80d5 [ Bugfix ] Fix AutoFP8 fp8 marlin (#6609) 2024-07-20 17:25:56 -06:00
Michael Goin
f952bbc8ff [Misc] Fix input_scale typing in w8a8_utils.py (#6579) 2024-07-20 23:11:13 +00:00
Robert Shaw
9364f74eee [ Kernel ] Enable fp8-marlin for fbgemm-fp8 models (#6606) 2024-07-20 18:50:10 +00:00
Matt Wong
06d6c5fe9f [Bugfix][CI/Build][Hardware][AMD] Fix AMD tests, add HF cache, update CK FA, add partially supported model notes (#6543) 2024-07-20 09:39:07 -07:00
Robert Shaw
683e3cb9c4 [ Misc ] fbgemm checkpoints (#6559) 2024-07-20 09:36:57 -07:00
Cyrus Leung
9042d68362 [Misc] Consolidate and optimize logic for building padded tensors (#6541) 2024-07-20 04:17:24 +00:00
Travis Johnson
3f8d42c81f Pipeline Parallel: Guard for KeyErrors at request abort (#6587)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-19 19:18:19 -07:00
Antoni Baum
7bd82002ae [Core] Allow specifying custom Executor (#6557) 2024-07-20 01:25:06 +00:00
Varun Sundar Rabindranath
2e26564259 [ Kernel ] FP8 Dynamic Per Token Quant - Add scale_ub (#6593)
Co-authored-by: Varun Sundar Rabindranth <varun@neuralmagic.com>
2024-07-19 18:15:26 -07:00
youkaichao
e81522e879 [build] add ib in image for out-of-the-box infiniband support (#6599)
[build] add ib so that multi-node support with infiniband can be supported out-of-the-box (#6599)
2024-07-19 17:16:57 -07:00
Murali Andoorveedu
45ceb85a0c [Docs] Update PP docs (#6598) 2024-07-19 16:38:21 -07:00
Robert Shaw
4cc24f01b1 [ Kernel ] Enable Dynamic Per Token fp8 (#6547) 2024-07-19 23:08:15 +00:00
youkaichao
07eb6f19f3 [bugfix][distributed] fix multi-node bug for shared memory (#6597) 2024-07-19 15:34:34 -07:00
Thomas Parnell
f0bbfaf917 [Bugfix] [SpecDecode] AsyncMetricsCollector: update time since last collection (#6578) 2024-07-19 14:01:03 -07:00
Simon Mo
30efe41532 [Docs] Update docs for wheel location (#6580) 2024-07-19 12:14:11 -07:00
Antoni Baum
9ed82e7074 [Misc] Small perf improvements (#6520) 2024-07-19 12:10:56 -07:00
Daniele
51f8aa90ad [Bugfix][Frontend] remove duplicate init logger (#6581) 2024-07-19 10:16:27 -07:00
Thomas Parnell
a5314e8698 [Model] RowParallelLinear: pass bias to quant_method.apply (#6327)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-19 07:15:22 -06:00
Woo-Yeon Lee
a921e86392 [BUGFIX] Raise an error for no draft token case when draft_tp>1 (#6369) 2024-07-19 06:01:09 -07:00
Cyrus Leung
6366efc67b [Bugfix][Frontend] Fix missing /metrics endpoint (#6463) 2024-07-19 03:55:13 +00:00
Robert Shaw
dbe5588554 [ Misc ] non-uniform quantization via compressed-tensors for Llama (#6515) 2024-07-18 22:39:18 -04:00
Thomas Parnell
d4201e06d5 [Bugfix] Make spec. decode respect per-request seed. (#6034)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-07-18 19:22:08 -07:00
Nick Hill
b5672a112c [Core] Multiprocessing Pipeline Parallel support (#6130)
Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-18 19:15:52 -07:00
Simon Mo
c5df56f88b Add support for a rope extension method (#6553) 2024-07-19 01:53:03 +00:00
Tyler Michael Smith
1689219ebf [CI/Build] Build on Ubuntu 20.04 instead of 22.04 (#6517) 2024-07-18 17:29:25 -07:00
Tyler Michael Smith
4ffffccb7e [Kernel] Implement fallback for FP8 channelwise using torch._scaled_mm (#6552) 2024-07-18 23:52:22 +00:00
youkaichao
f53b8f0d05 [ci][test] add correctness test for cpu offloading (#6549) 2024-07-18 23:41:06 +00:00
Kevin H. Luu
2d4733ba2d Fix PR comment bot (#6554)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-18 14:48:29 -07:00
Michael Goin
15c6a079b1 [Model] Support Mistral-Nemo (#6548) 2024-07-18 20:31:50 +00:00
Kevin H. Luu
ecdb462c24 [ci] Reword Github bot comment (#6534) 2024-07-18 08:01:45 -07:00
Robert Shaw
58ca663224 [ Misc ] Improve Min Capability Checking in compressed-tensors (#6522) 2024-07-18 14:39:12 +00:00
Woosuk Kwon
4634c8728b [TPU] Refactor TPU worker & model runner (#6506) 2024-07-18 01:34:16 -07:00
Noam Gat
c8a7d51c49 [Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501) 2024-07-18 07:47:13 +00:00
Nick Hill
e2fbaee725 [BugFix][Frontend] Use LoRA tokenizer in OpenAI APIs (#6227)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-18 15:13:30 +08:00
Cody Yu
8a74c68bd1 [Misc] Minor patch for draft model runner (#6523) 2024-07-18 06:06:21 +00:00
Rui Qiao
61e592747c [Core] Introduce SPMD worker execution using Ray accelerated DAG (#6032)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Stephanie Wang <swang@cs.berkeley.edu>
2024-07-17 22:27:09 -07:00
Nick Hill
d25877dd9b [BugFix] Avoid secondary error in ShmRingBuffer destructor (#6530) 2024-07-17 22:24:43 -07:00
youkaichao
1c27d25fb5 [core][model] yet another cpu offload implementation (#6496)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-17 20:54:35 -07:00
Robert Shaw
18fecc3559 [ Kernel ] Fp8 Channelwise Weight Support (#6487) 2024-07-18 03:18:13 +00:00
Cody Yu
b5af8c223c [Model] Pipeline parallel support for Mixtral (#6516) 2024-07-17 19:26:04 -07:00
Varun Sundar Rabindranath
b5241e41d9 [ Kernel ] FP8 Dynamic-Per-Token Quant Kernel (#6511)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-18 01:38:35 +00:00
Alexander Matveev
e76466dde2 [Core] draft_model_runner: Implement prepare_inputs on GPU for advance_step (#6338) 2024-07-17 14:30:28 -07:00
Antoni Baum
5f0b9933e6 [Bugfix] Fix Ray Metrics API usage (#6354) 2024-07-17 19:40:10 +00:00
milo157
a38524f338 [DOC] - Add docker image to Cerebrium Integration (#6510) 2024-07-17 10:22:53 -07:00
Cody Yu
2fa4623d9e [Core] Refactor _prepare_model_input_tensors - take 2 (#6164) 2024-07-17 09:37:16 -07:00
Woosuk Kwon
a9a2e74d21 [Misc] Use torch.Tensor for type annotation (#6505) 2024-07-17 13:01:10 +00:00
Woosuk Kwon
e09ce759aa [TPU] Remove multi-modal args in TPU backend (#6504) 2024-07-17 04:02:53 -07:00
Murali Andoorveedu
5fa6e9876e [Bugfix] Fix for multinode crash on 4 PP (#6495)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-17 08:25:10 +00:00
Cyrus Leung
5bf35a91e4 [Doc][CI/Build] Update docs and tests to use vllm serve (#6431) 2024-07-17 07:43:21 +00:00
shangmingc
a19e8d3726 [Misc][Speculative decoding] Typos and typing fixes (#6467)
Co-authored-by: caishangming.csm <caishangming.csm@alibaba-inc.com>
2024-07-17 07:17:07 +00:00
Hongxia Yang
10383887e0 [ROCm] Cleanup Dockerfile and remove outdated patch (#6482) 2024-07-16 22:47:02 -07:00
Wushi Dong
1d094fd7c0 [Distributed][PP] only create embedding & lm head when necessary (#6455)
original title: [Distributed][Model] Rank-based Component Creation for Pipeline Parallelism Memory Optimization
2024-07-16 19:20:26 -07:00
youkaichao
ce37be7ba0 [misc][distributed] add seed to dummy weights (#6491) 2024-07-16 19:16:34 -07:00
youkaichao
7f62077af5 [misc][distributed] improve tests (#6488) 2024-07-16 17:35:52 -07:00
youkaichao
09c2eb85dd [ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00
Michael Goin
978aed5300 [Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081) 2024-07-16 15:31:32 -07:00
Cody Yu
160e1d8c99 [Misc] Log spec decode metrics (#6454) 2024-07-16 20:37:10 +00:00
Jiaxin Shan
94162beb9f [Doc] Fix the lora adapter path in server startup script (#6230) 2024-07-16 10:11:04 -07:00
Woosuk Kwon
c467dff24f [Hardware][TPU] Support MoE with Pallas GMM kernel (#6457) 2024-07-16 09:56:28 -07:00
youkaichao
9f4ccec761 [doc][misc] remind to cancel debugging environment variables (#6481)
[doc][misc] remind users to cancel debugging environment variables after debugging (#6481)
2024-07-16 09:45:30 -07:00
Cyrus Leung
38ef94888a [CI/Build] Remove "boardwalk" image asset (#6460) 2024-07-16 08:59:36 -07:00
Peng Guanwen
2bb0489cb3 [Core] Use numpy to speed up padded token processing (#6442) 2024-07-16 08:13:25 -07:00
Thomas Parnell
7508a3dc34 [Misc] Fix typos in spec. decode metrics logging. (#6470)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-16 13:55:15 +00:00
sasha0552
7a3d2a5b95 [Frontend] Support for chat completions input in the tokenize endpoint (#5923) 2024-07-16 20:18:09 +08:00
Cyrus Leung
d97011512e [CI/Build] vLLM cache directory for images (#6444) 2024-07-15 23:12:25 -07:00
Woosuk Kwon
37d776606f [Docs] Announce 5th meetup (#6458) 2024-07-15 21:04:58 -07:00
Joe
d92b3c5cde [Bugfix][CI/Build] Test prompt adapters in openai entrypoint tests (#6419) 2024-07-15 18:54:15 -07:00
Mor Zusman
9ad32dacd9 [BugFix][Model] Jamba - Handle aborted requests, Add tests and fix cleanup bug (#6425)
Co-authored-by: Mor Zusman <morz@ai21.com>
2024-07-16 01:32:55 +00:00
Kevin H. Luu
d6f3b3d5c4 Pin sphinx-argparse version (#6453)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-16 01:26:11 +00:00
Woosuk Kwon
4552e37b55 [CI/Build][TPU] Add TPU CI test (#6277)
Co-authored-by: kevin <kevin@anyscale.com>
2024-07-15 14:31:16 -07:00
Woosuk Kwon
ec9933f4a5 [Misc] Add CustomOp Interface to UnquantizedFusedMoEMethod (#6289) 2024-07-15 19:02:14 +00:00
Woosuk Kwon
3dee97b05f [Docs] Add Google Cloud to sponsor list (#6450) 2024-07-15 11:58:10 -07:00
youkaichao
4cf256ae7f [misc][distributed] fix pp missing layer condition (#6446)
Some checks failed
Create Release / Create Release (push) Has been cancelled
Create Release / Build Wheel (11.8, ubuntu-20.04, 3.10, 2.3.1) (push) Has been cancelled
Create Release / Build Wheel (11.8, ubuntu-20.04, 3.11, 2.3.1) (push) Has been cancelled
Create Release / Build Wheel (11.8, ubuntu-20.04, 3.8, 2.3.1) (push) Has been cancelled
Create Release / Build Wheel (11.8, ubuntu-20.04, 3.9, 2.3.1) (push) Has been cancelled
Create Release / Build Wheel (12.1, ubuntu-20.04, 3.10, 2.3.1) (push) Has been cancelled
Create Release / Build Wheel (12.1, ubuntu-20.04, 3.11, 2.3.1) (push) Has been cancelled
Create Release / Build Wheel (12.1, ubuntu-20.04, 3.8, 2.3.1) (push) Has been cancelled
Create Release / Build Wheel (12.1, ubuntu-20.04, 3.9, 2.3.1) (push) Has been cancelled
2024-07-15 10:32:35 -07:00
Simon Mo
64fdc08c72 bump version to v0.5.2 (#6433) 2024-07-15 17:27:40 +00:00
Thomas Parnell
4ef95b0f06 [Bugfix] use float32 precision in samplers/test_logprobs.py for comparing with HF (#6409)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-15 13:14:49 -04:00
Thomas Parnell
eaec4b9153 [Bugfix] Add custom Triton cache manager to resolve MoE MP issue (#6140)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
2024-07-15 10:12:47 -07:00
Pernekhan Utemuratov
a63a4c6341 [Misc] Use 0.0.9 version for flashinfer (#6447)
Co-authored-by: Pernekhan Utemuratov <pernekhan@deepinfra.com>
2024-07-15 10:10:26 -07:00
Tyler Michael Smith
c8fd97f26d [Kernel] Use CUTLASS kernels for the FP8 layers with Bias (#6270) 2024-07-15 13:05:52 -04:00
youkaichao
94b82e8c18 [doc][distributed] add suggestion for distributed inference (#6418) 2024-07-15 09:45:51 -07:00
Roger Wang
6ae1597ddf [VLM] Minor space optimization for ClipVisionModel (#6436) 2024-07-15 17:29:51 +08:00
youkaichao
22e79ee8f3 [doc][misc] doc update (#6439) 2024-07-14 23:33:25 -07:00
Cyrus Leung
de19916314 [Bugfix] Convert image to RGB by default (#6430) 2024-07-15 05:39:15 +00:00
youkaichao
69672f116c [core][distributed] simplify code to support pipeline parallel (#6406) 2024-07-14 21:20:51 -07:00
DefTruth
44874a0bf9 [Doc] add env docs for flashinfer backend (#6437) 2024-07-14 21:16:51 -07:00
zifeitong
b47008b4d2 [BugFix] BatchResponseData body should be optional (#6345)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-15 04:06:09 +00:00
Simon Mo
9bfece89fd Add FUNDING.yml (#6435) 2024-07-14 20:36:16 -07:00
Simon Mo
32c9d7f765 Report usage for beam search (#6404) 2024-07-14 19:37:35 -07:00
Fish
ccb20db8bd [Bugfix] Benchmark serving script used global parameter 'args' in function 'sample_random_requests' (#6428) 2024-07-14 19:27:01 -07:00
Robert Shaw
a754dc2cb9 [CI/Build] Cross python wheel (#6394) 2024-07-14 18:54:46 -07:00
Robert Cohn
61e85dbad8 [Doc] xpu backend requires running setvars.sh (#6393) 2024-07-14 17:10:11 -07:00
Ethan Xu
dbfe254eda [Feature] vLLM CLI (#5090)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-07-14 15:36:43 -07:00
Robert Shaw
73030b7dae [ Misc ] Enable Quantizing All Layers of DeekSeekv2 (#6423) 2024-07-14 21:38:42 +00:00
youkaichao
ccd3c04571 [ci][build] fix commit id (#6420)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-07-14 22:16:21 +08:00
Tyler Michael Smith
9dad5cc859 [Kernel] Turn off CUTLASS scaled_mm for Ada Lovelace (#6384) 2024-07-14 13:37:19 +00:00
Yuan Tang
6ef3bf912c Remove unnecessary trailing period in spec_decode.rst (#6405) 2024-07-14 07:58:09 +00:00
Isotr0py
540c0368b1 [Model] Initialize Fuyu-8B support (#3924)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-14 05:27:14 +00:00
Robert Shaw
fb6af8bc08 [ Misc ] Apply MoE Refactor to Deepseekv2 To Support Fp8 (#6417) 2024-07-13 20:03:58 -07:00
Woosuk Kwon
eeceadaecc [Misc] Add deprecation warning for beam search (#6402) 2024-07-13 11:52:22 -07:00
Robert Shaw
babf52dade [ Misc ] More Cleanup of Marlin (#6359)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-07-13 10:21:37 +00:00
Noam Gat
9da4aad44b Updating LM Format Enforcer version to v10.3 (#6411) 2024-07-13 10:09:12 +00:00
youkaichao
41708e5034 [ci] try to add multi-node tests (#6280)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-12 21:51:48 -07:00
Woosuk Kwon
d80aef3776 [Docs] Clean up latest news (#6401) 2024-07-12 19:36:53 -07:00
Thomas Parnell
e1684a766a [Bugfix] Fix hard-coded value of x in context_attention_fwd (#6373)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-12 18:30:54 -07:00
Saliya Ekanayake
a27f87da34 [Doc] Fix Typo in Doc (#6392)
Co-authored-by: Saliya Ekanayake <esaliya@d-matrix.ai>
2024-07-13 00:48:23 +00:00
Kevin H. Luu
16ff6bd58c [ci] Fix wording for GH bot (#6398)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 16:34:37 -07:00
Woosuk Kwon
f8f9ff57ee [Bugfix][TPU] Fix megacore setting for v5e-litepod (#6397) 2024-07-12 15:59:47 -07:00
Simon Mo
6bc9710f6e Fix release pipeline's dir permission (#6391) 2024-07-12 15:52:43 -07:00
Michael Goin
111fc6e7ec [Misc] Add generated git commit hash as vllm.__commit__ (#6386) 2024-07-12 22:52:15 +00:00
Cody Yu
75f64d8b94 [Bugfix] Fix illegal memory access in FP8 MoE kernel (#6382) 2024-07-12 21:33:33 +00:00
Simon Mo
21b2dcedab Fix release pipeline's -e flag (#6390) 2024-07-12 14:08:04 -07:00
Simon Mo
07b35af86d Fix interpolation in release pipeline (#6389) 2024-07-12 14:03:39 -07:00
Simon Mo
bb1a784b05 Fix release-pipeline.yaml (#6388) 2024-07-12 14:00:57 -07:00
Simon Mo
d719ba24c5 Build some nightly wheels by default (#6380) 2024-07-12 13:56:59 -07:00
Cody Yu
aa48e502fb [MISC] Upgrade dependency to PyTorch 2.3.1 (#5327) 2024-07-12 12:04:26 -07:00
Kevin H. Luu
4dbebd03cc [ci] Add GHA workflows to enable full CI run (#6381)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 11:36:26 -07:00
Kevin H. Luu
b75bce1008 [ci] Add grouped tests & mark tests to run by default for fastcheck pipeline (#6365)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 09:58:38 -07:00
Yihuan Bu
b039cbbce3 [Misc] add fixture to guided processor tests (#6341) 2024-07-12 09:55:39 -07:00
Alexei-V-Ivanov-AMD
f9d25c2519 [Build/CI] Checking/Waiting for the GPU's clean state (#6379) 2024-07-12 09:42:24 -07:00
Cyrus Leung
024ad87cdc [Bugfix] Fix dtype mismatch in PaliGemma (#6367) 2024-07-12 08:22:18 -07:00
Robert Shaw
aea19f0989 [ Misc ] Support Models With Bias in compressed-tensors integration (#6356) 2024-07-12 11:11:29 -04:00
Roger Wang
f7160d946a [Misc][Bugfix] Update transformers for tokenizer issue (#6364) 2024-07-12 08:40:07 +00:00
Robert Shaw
6047187cd8 [ Misc ] Remove separate bias add (#6353) 2024-07-12 05:06:09 +00:00
Hongxia Yang
b6c16cf8ff [ROCm][AMD] unify CUDA_VISIBLE_DEVICES usage in cuda/rocm (#6352) 2024-07-11 21:30:46 -07:00
adityagoel14
d26a8b3f1f [CI/Build] (2/2) Switching AMD CI to store images in Docker Hub (#6350) 2024-07-11 21:26:26 -07:00
Michael Goin
d59eb98489 [Model][Phi3-Small] Remove scipy from blocksparse_attention (#6343) 2024-07-12 10:47:17 +08:00
Helena Kloosterman
adf32e0a0f [Bugfix] Fix usage stats logging exception warning with OpenVINO (#6349) 2024-07-12 10:47:00 +08:00
youkaichao
2b0fb53481 [distributed][misc] be consistent with pytorch for libcudart.so (#6346)
[distributed][misc] keep consistent with how pytorch finds libcudart.so (#6346)
2024-07-11 19:35:17 -07:00
Lily Liu
d6ab528997 [Misc] Remove flashinfer warning, add flashinfer tests to CI (#6351) 2024-07-12 01:32:06 +00:00
Robert Shaw
7ed6a4f0e1 [ BugFix ] Prompt Logprobs Detokenization (#6223)
Co-authored-by: Zifei Tong <zifeitong@gmail.com>
2024-07-11 22:02:29 +00:00
Kuntai Du
a4feba929b [CI/Build] Add nightly benchmarking for tgi, tensorrt-llm and lmdeploy (#5362) 2024-07-11 13:28:38 -07:00
youkaichao
2d23b42d92 [doc] update pipeline parallel in readme (#6347) 2024-07-11 11:38:40 -07:00
xwjiang2010
1df43de9bb [bug fix] Fix llava next feature size calculation. (#6339)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
2024-07-11 17:21:10 +00:00
Simon Mo
52b7fcb35a Benchmark: add H100 suite (#6047) 2024-07-11 09:17:07 -07:00
Robert Shaw
b675069d74 [ Misc ] Refactor Marlin Python Utilities (#6082)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-07-11 15:40:11 +00:00
Mor Zusman
55f692b46e [BugFix] get_and_reset only when scheduler outputs are not empty (#6266) 2024-07-11 07:40:20 -07:00
Thomas Parnell
8a1415cf77 [Bugfix] GPTBigCodeForCausalLM: Remove lm_head from supported_lora_modules. (#6326)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-11 07:05:59 -07:00
pushan
546b101fa0 [BugFix]: fix engine timeout due to request abort (#6255)
Signed-off-by: yatta zhang <ytzhang01@foxmail.com>
Signed-off-by: zhangyuntao.dev <zhangyuntao.dev@bytedance.com>
Co-authored-by: zhangyuntao.dev <zhangyuntao.dev@bytedance.com>
2024-07-11 06:46:31 -07:00
aniaan
3963a5335b [Misc] refactor(config): clean up unused code (#6320) 2024-07-11 09:39:07 +00:00
Roger Wang
c4774eb841 [Bugfix] Fix snapshot download in serving benchmark (#6318) 2024-07-11 07:04:05 +00:00
Lim Xiang Yang
fc17110bbe [BugFix]: set outlines pkg version (#6262) 2024-07-11 04:37:11 +00:00
Jie Fu (傅杰)
439c84581a [Doc] Update description of vLLM support for CPUs (#6003) 2024-07-10 21:15:29 -07:00
daquexian
99ded1e1c4 [Doc] Remove comments incorrectly copied from another project (#6286) 2024-07-10 17:05:26 -07:00
Woosuk Kwon
997df46a32 [Bugfix][Neuron] Fix soft prompt method error in NeuronExecutor (#6313) 2024-07-10 16:39:02 -07:00
sroy745
ae151d73be [Speculative Decoding] Enabling bonus token in speculative decoding for KV cache based models (#5765) 2024-07-10 16:02:47 -07:00
sangjune.park
44cc76610d [Bugfix] Fix OpenVINOExecutor abstractmethod error (#6296)
Signed-off-by: sangjune.park <sangjune.park@navercorp.com>
2024-07-10 10:03:32 -07:00
Benjamin Muskalla
b422d4961a [CI/Build] Enable mypy typing for remaining folders (#6268) 2024-07-10 22:15:55 +08:00
Thomas Parnell
c38eba3046 [Bugfix] MLPSpeculator: Use ParallelLMHead in tie_weights=False case. (#6303)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-10 09:04:07 -04:00
Woosuk Kwon
e72ae80b06 [Bugfix] Support 2D input shape in MoE layer (#6287) 2024-07-10 09:03:16 -04:00
Cyrus Leung
8a924d2248 [Doc] Guide for adding multi-modal plugins (#6205) 2024-07-10 14:55:34 +08:00
Woosuk Kwon
5ed3505d82 [Bugfix][TPU] Add prompt adapter methods to TPUExecutor (#6279) 2024-07-09 19:30:56 -07:00
youkaichao
da78caecfa [core][distributed] zmq fallback for broadcasting large objects (#6183)
[core][distributed] add zmq fallback for broadcasting large objects (#6183)
2024-07-09 18:49:11 -07:00
Abhinav Goyal
2416b26e11 [Speculative Decoding] Medusa Implementation with Top-1 proposer (#4978) 2024-07-09 18:34:02 -07:00
Baoyuan Qi
d3a245138a [Bugfix]fix and needs_scalar_to_array logic check (#6238)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-07-09 23:43:24 +00:00
Murali Andoorveedu
673dd4cae9 [Docs] Docs update for Pipeline Parallel (#6222)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-07-09 16:24:58 -07:00
Swapnil Parekh
4d6ada947c [CORE] Adding support for insertion of soft-tuned prompts (#4645)
Co-authored-by: Swapnil Parekh <swapnilp@ibm.com>
Co-authored-by: Joe G <joseph.granados@h2o.ai>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-07-09 13:26:36 -07:00
Kevin H. Luu
a0550cbc80 Add support for multi-node on CI (#5955)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-09 12:56:56 -07:00
Woosuk Kwon
08c5bdecae [Bugfix][TPU] Fix outlines installation in TPU Dockerfile (#6256) 2024-07-09 02:56:06 -07:00
Woosuk Kwon
5d5b4c5fe5 [Bugfix][TPU] Add missing None to model input (#6245) 2024-07-09 00:21:37 -07:00
youkaichao
70c232f85a [core][distributed] fix ray worker rank assignment (#6235) 2024-07-08 21:31:44 -07:00
youkaichao
a3c9435d93 [hardware][cuda] use device id under CUDA_VISIBLE_DEVICES for get_device_capability (#6216) 2024-07-08 20:02:15 -07:00
Simon Mo
4f0e0ea131 Add FlashInfer to default Dockerfile (#6172) 2024-07-08 13:38:03 -07:00
tomeras91
ddc369fba1 [Bugfix] Mamba cache Cuda Graph padding (#6214) 2024-07-08 11:25:51 -07:00
Eric
185ad31f37 [Bugfix] use diskcache in outlines _get_guide #5436 (#6203) 2024-07-08 11:23:24 -07:00
afeldman-nm
543aa48573 [Kernel] Correctly invoke prefill & decode kernels for cross-attention (towards eventual encoder/decoder model support) (#4888)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-07-08 17:12:15 +00:00
Avshalom Manevich
f7a8fa39d8 [Kernel] reloading fused_moe config on the last chunk (#6210) 2024-07-08 08:00:38 -07:00
Haichuan
717f4bcea0 Feature/add benchmark testing (#5947)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-08 07:52:06 +00:00
kczimm
16620f439d do not exclude object field in CompletionStreamResponse (#6196) 2024-07-08 10:32:57 +08:00
youkaichao
3b08fe2b13 [misc][frontend] log all available endpoints (#6195)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-07-07 15:11:12 -07:00
Robert Shaw
abfe705a02 [ Misc ] Support Fp8 via llm-compressor (#6110)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-07-07 20:42:11 +00:00
Haichuan
333306a252 add benchmark for fix length input and output (#5857)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-07 07:42:13 +00:00
Roger Wang
6206dcb29e [Model] Add PaliGemma (#5189)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-07-07 09:25:50 +08:00
Cyrus Leung
9389380015 [Doc] Move guide for multimodal model and other improvements (#6168) 2024-07-06 17:18:59 +08:00
Roger Wang
175c43eca4 [Doc] Reorganize Supported Models by Type (#6167) 2024-07-06 05:59:36 +00:00
Simon Mo
bc96d5c330 Move release wheel env var to Dockerfile instead (#6163) 2024-07-05 17:19:53 -07:00
Simon Mo
f0250620dd Fix release wheel build env var (#6162) 2024-07-05 16:24:31 -07:00
Simon Mo
2de490d60f Update wheel builds to strip debug (#6161) 2024-07-05 14:51:25 -07:00
793 changed files with 78471 additions and 18388 deletions

View File

@@ -1,7 +1,7 @@
 import os
 import zipfile
-MAX_SIZE_MB = 200
+MAX_SIZE_MB = 250
 def print_top_10_largest_files(zip_file):
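The hunk above only shows the size limit being raised from 200 MB to 250 MB; the body of `print_top_10_largest_files` and the rest of the check are not part of the diff. As a rough sketch of what a wheel-size gate like this typically does with `zipfile` (the helper `check_wheel_size` and all logic below are assumptions for illustration, not the actual file contents):

```python
import os
import zipfile

MAX_SIZE_MB = 250  # limit raised in this change


def print_top_10_largest_files(zip_file):
    # List entries by uncompressed size so an oversized wheel is easy to diagnose.
    with zipfile.ZipFile(zip_file, 'r') as z:
        entries = [(info.filename, info.file_size) for info in z.infolist()]
    for name, size in sorted(entries, key=lambda e: e[1], reverse=True)[:10]:
        print(f"{name}: {size / (1024 * 1024):.2f} MB")


def check_wheel_size(directory):
    # Fail the build if any wheel under `directory` exceeds MAX_SIZE_MB.
    for root, _, files in os.walk(directory):
        for f in files:
            if f.endswith(".whl"):
                path = os.path.join(root, f)
                size_mb = os.path.getsize(path) / (1024 * 1024)
                if size_mb > MAX_SIZE_MB:
                    print(f"Wheel {path} is {size_mb:.2f} MB, exceeds {MAX_SIZE_MB} MB")
                    print_top_10_largest_files(path)
                    return 1
    return 0
```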

View File

@@ -1,14 +0,0 @@
#!/bin/bash
set -ex
set -o pipefail
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
cd -

View File

@@ -0,0 +1,12 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.671
- name: "exact_match,flexible-extract"
value: 0.664
limit: 1000
num_fewshot: 5
trust_remote_code: True

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.905
- name: "exact_match,flexible-extract"
value: 0.905
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.752
- name: "exact_match,flexible-extract"
value: 0.754
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.755
- name: "exact_match,flexible-extract"
value: 0.755
limit: 1000
num_fewshot: 5

View File

@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
 - name: "gsm8k"
 metrics:
 - name: "exact_match,strict-match"
-value: 0.756
+value: 0.753
 - name: "exact_match,flexible-extract"
-value: 0.752
+value: 0.753
-limit: 250
+limit: 1000
 num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.728
- name: "exact_match,flexible-extract"
value: 0.728
limit: 250
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.758
- name: "exact_match,flexible-extract"
value: 0.759
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.233
- name: "exact_match,flexible-extract"
value: 0.236
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.578
- name: "exact_match,flexible-extract"
value: 0.585
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.593
- name: "exact_match,flexible-extract"
value: 0.588
limit: 1000
num_fewshot: 5

View File

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.595
- name: "exact_match,flexible-extract"
value: 0.582
limit: 1000
num_fewshot: 5

View File

@@ -1,3 +1,5 @@
+Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
 Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
+DeepSeek-V2-Lite-Chat.yaml

View File

@@ -1,2 +1,10 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml

View File

@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.2
+# pip install lm-eval==0.4.3
 usage() {
 echo``
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 lm_eval --model vllm \
---model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
+--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
 --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
 --batch_size $BATCH_SIZE

View File

@@ -14,7 +14,7 @@ import lm_eval
 import numpy
 import yaml
-RTOL = 0.02
+RTOL = 0.05
 TEST_DATA_FILE = os.environ.get(
 "LM_EVAL_TEST_DATA_FILE",
 ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@@ -23,8 +23,12 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
 def launch_lm_eval(eval_config):
+trust_remote_code = eval_config.get('trust_remote_code', False)
 model_args = f"pretrained={eval_config['model_name']}," \
-f"tensor_parallel_size={TP_SIZE}"
+f"tensor_parallel_size={TP_SIZE}," \
+f"add_bos_token=true," \
+f"trust_remote_code={trust_remote_code}"
 results = lm_eval.simple_evaluate(
 model="vllm",
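To make this change easier to read outside the diff view, here is a minimal sketch of the resulting `launch_lm_eval` as reconstructed from the hunk above; the remaining `simple_evaluate` arguments (tasks, few-shot count, limit, batch size) are not shown in the hunk and are assumptions for illustration:

```python
import os

import lm_eval

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def launch_lm_eval(eval_config):
    # New in this change: trust_remote_code comes from the per-model YAML config
    # (defaults to False) and is forwarded to vLLM via model_args.
    trust_remote_code = eval_config.get('trust_remote_code', False)
    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
                 f"add_bos_token=true," \
                 f"trust_remote_code={trust_remote_code}"
    # Assumed call shape: run gsm8k through lm-eval-harness's vLLM backend.
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=["gsm8k"],
        num_fewshot=eval_config.get("num_fewshot", 5),
        limit=eval_config.get("limit", 1000),
        batch_size="auto")
    return results
```

A typical run would set `LM_EVAL_TEST_DATA_FILE` to one of the YAML configs above and `LM_EVAL_TP_SIZE` before invoking the test file (presumably via pytest), matching the environment variables read at the top of the module.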

View File

@@ -1,31 +1,54 @@
 # vLLM benchmark suite
 ## Introduction
-This directory contains the performance benchmarking CI for vllm.
-The goal is to help developers know the impact of their PRs on the performance of vllm.
-This benchmark will be *triggered* upon:
-- A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label.
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for more GPUs is comming later), with different models.
+This directory contains two sets of benchmark for vllm.
+- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
+- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
+See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+## Performance benchmark quick overview
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
 **Benchmarking Duration**: about 1hr.
-**For benchmarking developers**: please try your best to constraint the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
+**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
-## Configuring the workload
-The benchmarking workload contains three parts:
-- Latency tests in `latency-tests.json`.
-- Throughput tests in `throughput-tests.json`.
-- Serving tests in `serving-tests.json`.
-See [descriptions.md](tests/descriptions.md) for detailed descriptions.
-### Latency test
+## Nightly benchmark quick overview
+**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
+**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
+**Benchmarking Duration**: about 3.5hrs.
+## Trigger the benchmark
+Performance benchmark will be triggered when:
+- A PR being merged into vllm.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
+Nightly benchmark will be triggered when:
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
+## Performance benchmark details
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+#### Latency test
 Here is an example of one test inside `latency-tests.json`:
@@ -46,19 +69,19 @@ Here is an example of one test inside `latency-tests.json`:
 In this example:
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
-### Throughput test
+#### Throughput test
 The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
-### Serving test
+#### Serving test
 We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 ```
@@ -95,9 +118,36 @@ The number of this test is less stable compared to the delay and latency benchma
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
-## Visualizing the results
+#### Visualizing the results
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
+## Nightly test details
+See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
+#### Workflow
+- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
+- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
+- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
+- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+#### Nightly tests
+In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
+#### Docker containers
+The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
+WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
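The unchanged `latency-tests.json` example that the README refers to is elided from this diff view. Here is a hedged reconstruction of what one such entry looks like, built only from the command line arguments quoted in the bullet above; the `test_name` value is illustrative and not taken from the real file:

```json
[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
```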

View File

@@ -11,7 +11,7 @@ steps:
 - sh
 - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
 - wait
-- label: "A100 Benchmark"
+- label: "A100"
 agents:
 queue: A100
 plugins:
@@ -21,7 +21,7 @@ steps:
 containers:
 - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
 command:
-- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 resources:
 limits:
 nvidia.com/gpu: 8
@@ -42,7 +42,7 @@ steps:
 - name: devshm
 emptyDir:
 medium: Memory
-# - label: "H100: NVIDIA SMI"
+# - label: "H100"
 # agents:
 # queue: H100
 # plugins:
@@ -53,7 +53,6 @@ steps:
 # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
 # mount-buildkite-agent: true
 # propagate-environment: true
-# propagate-uid-gid: false
 # ipc: host
 # gpus: all
 # environment:

View File

@@ -1,27 +0,0 @@
#!/usr/bin/env bash
# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
set -euo pipefail
# Install system packages
apt update
apt install -y curl jq
# Install minijinja for templating
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
source $HOME/.cargo/env
# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
else
echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
exit 0
fi
fi
# Upload sample.yaml
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml

View File

@@ -0,0 +1,45 @@
# Nightly benchmark
The main goal of this benchmarking is two-fold:
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
## Docker images
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
## Hardware
One AWS node with 8x NVIDIA A100 GPUs.
## Workload description
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
## Plots
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
## Results
{nightly_results_benchmarking_table}

View File

@@ -0,0 +1,120 @@
common_pod_spec: &common_pod_spec
priorityClassName: perf-benchmark
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /root/.cache/huggingface
type: Directory
common_container_settings: &common_container_settings
command:
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 trt benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
<<: *common_container_settings
- label: "A100 lmdeploy benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: openmmlab/lmdeploy:v0.5.0
<<: *common_container_settings
- label: "A100 vllm benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:latest
<<: *common_container_settings
- label: "A100 tgi benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: ghcr.io/huggingface/text-generation-inference:2.1
<<: *common_container_settings
- wait
- label: "Plot"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.5.0.post1
command:
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- wait

View File

@@ -1,47 +1,42 @@
 ## Latency tests
-This test suite aims to test vllm's end-to-end latency under a controlled setup.
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
-### Latency benchmarking results
 {latency_tests_markdown_table}
-## Throughput tests
-This test suite aims to test vllm's throughput.
+## Throughput tests
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
-### Throughput benchmarking results
 {throughput_tests_markdown_table}
-## Serving tests
-This test suite aims to test vllm's real serving metrics.
+## Serving tests
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
-### Serving benchmarking results
 {serving_tests_markdown_table}
 ## json version of the benchmarking tables
 This section contains the data of the markdown tables above in JSON format.

View File

@@ -0,0 +1,76 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"

View File

@@ -174,8 +174,8 @@ if __name__ == "__main__":
 # document the result
 with open(results_folder / "benchmark_results.md", "w") as f:
-results = read_markdown(
-"../.buildkite/nightly-benchmarks/tests/descriptions.md")
+results = read_markdown("../.buildkite/nightly-benchmarks/" +
+"performance-benchmarks-descriptions.md")
 results = results.format(
 latency_tests_markdown_table=latency_md_table,
 throughput_tests_markdown_table=throughput_md_table,
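For context, this templating step fills the `{latency_tests_markdown_table}`-style placeholders from the description markdown above using `str.format`. A minimal, self-contained sketch of the idea (the inline template and table strings below are stand-ins, not the real files or results):

```python
# A tiny stand-in template with the same placeholders as the descriptions file.
template = (
    "## Latency tests\n"
    "{latency_tests_markdown_table}\n"
    "## Throughput tests\n"
    "{throughput_tests_markdown_table}\n"
    "## Serving tests\n"
    "{serving_tests_markdown_table}\n"
)

# Hypothetical pre-built markdown table standing in for real benchmark results.
latency_md_table = "| model | mean latency (s) |\n|---|---|\n| llama-3.1 8B | 1.23 |"

report = template.format(
    latency_tests_markdown_table=latency_md_table,
    throughput_tests_markdown_table="(throughput table goes here)",
    serving_tests_markdown_table="(serving table goes here)",
)
print(report)
```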

View File

@@ -0,0 +1,26 @@
import argparse
from transformers import AutoTokenizer
def main(model, cachedir):
# Load the tokenizer and save it to the specified directory
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.save_pretrained(cachedir)
print(f"Tokenizer saved to {cachedir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer")
parser.add_argument("--model",
type=str,
required=True,
help="Name of the model")
parser.add_argument("--cachedir",
type=str,
required=True,
help="Directory to save the tokenizer")
args = parser.parse_args()
main(args.model, args.cachedir)

View File

@@ -0,0 +1,6 @@
from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]
print(model_name)

View File

@@ -0,0 +1,102 @@
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameter inside tensorrt_demo is consistent to envvar
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &
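
The script expects two JSON blobs on the command line: the per-engine server parameters and the shared common parameters. A sketch of a matching invocation, run from the benchmarks/ directory, with values mirroring the nightly test config shown further below:

# sketch: values taken from trt_server_parameters / common_parameters in nightly-tests.json
server_params='{"model_type": "llama", "model_dtype": "float16", "max_batch_size": 256, "max_input_len": 4096, "max_output_len": 4096, "trt_llm_version": "r24.04"}'
common_params='{"model": "meta-llama/Meta-Llama-3-8B", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, "port": 8000}'
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"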

View File

@@ -0,0 +1,40 @@
#!/bin/bash
set -ex
set -o pipefail
main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results."
exit 0
fi
# initial annotation
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
cd $VLLM_SOURCE_CODE_LOC/benchmarks
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
ls results/
# generate figures
python3 -m pip install tabulate pandas matplotlib
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
--description $description \
--results-folder results/
# upload results and figures
/workspace/buildkite-agent artifact upload "nightly_results.png"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main "$@"

View File

@@ -0,0 +1,135 @@
import argparse
import json
import math
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__':
args = parse_arguments()
main(args)
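
The script assumes each *_nightly_results.json record carries the column names produced by summary-nightly-results.py (shown later in this diff). A single record looks roughly like the following; the numbers are placeholders, only the key names are taken from the column mapping:

# one illustrative record, matching the columns this script filters on
#   {"Test name": "vllm_llama8B_tp1_qps_4", "Engine": "vllm", "GPU": "A100",
#    "Successful req.": 500, "Tput (req/s)": 8.0,
#    "Mean TTFT (ms)": 120.0, "Std TTFT (ms)": 15.0,
#    "Mean ITL (ms)": 30.0, "Std ITL (ms)": 5.0,
#    "Input Tput (tok/s)": 2000.0, "Output Tput (tok/s)": 1500.0}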

View File

@@ -0,0 +1,218 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill lmdeploy || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args, replacing '_' with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append lmdeploy to the test name
test_name=lmdeploy_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
# prepare tokenizer
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "lmdeploy server is up and running."
else
echo ""
echo "lmdeploy failed to start within the timeout period."
break
fi
# get model name
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend lmdeploy \
--tokenizer /tokenizer_cache \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--model \"$model_name\" \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "lmdeploy" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -34,6 +34,15 @@ check_hf_token() {
   fi
 }
 
+ensure_sharegpt_downloaded() {
+  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
+  if [ ! -f "$FILE" ]; then
+    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
+  else
+    echo "$FILE already exists."
+  fi
+}
+
 json2args() {
   # transforms the JSON string to command line args, and '_' is replaced to '-'
   # example:
@@ -54,48 +63,62 @@ wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
   timeout 1200 bash -c '
-    until curl localhost:8000/v1/completions; do
+    until curl -X POST localhost:8000/v1/completions; do
       sleep 1
     done' && return 0 || return 1
 }
 
-kill_gpu_processes() {
-  # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-    echo "No GPU processes found."
+kill_processes_launched_by_current_bash() {
+  # Kill all python processes launched from current bash script
+  current_shell_pid=$$
+  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+  if [ -n "$processes" ]; then
+    echo "Killing the following processes matching '$1':"
+    echo "$processes"
+    echo "$processes" | xargs kill -9
   else
-    for pid in $pids; do
-      kill -9 "$pid"
-      echo "Killed process with PID: $pid"
-    done
-    echo "All GPU processes have been killed."
+    echo "No processes found matching '$1'."
   fi
+}
 
-  # waiting for GPU processes to be fully killed
-  sleep 10
+kill_gpu_processes() {
+  ps -aux
+  lsof -t -i:8000 | xargs -r kill -9
+  pkill -f pt_main_thread
+  # this line doesn't work now
+  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+  pkill -f python3
+  pkill -f /usr/bin/python3
+
+  # wait until GPU memory usage smaller than 1GB
+  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
+    sleep 1
+  done
 
   # remove vllm config file
   rm -rf ~/.config/vllm
-
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
 }
 
 upload_to_buildkite() {
   # upload the benchmarking results to buildkite
   # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
+  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
+  if command -v buildkite-agent >/dev/null 2>&1; then
+    BUILDKITE_AGENT_COMMAND="buildkite-agent"
+  elif [ -f /workspace/buildkite-agent ]; then
+    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
+  else
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+
+  # Use the determined command to annotate and upload artifacts
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
 
 run_latency_tests() {
@@ -146,7 +169,7 @@ run_latency_tests() {
             latency_command: $latency,
             gpu_type: $gpu
     }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
     eval "$latency_command"
@@ -156,7 +179,6 @@ run_latency_tests() {
   done
 }
 
-
 run_throughput_tests() {
   # run throughput tests using `benchmark_throughput.py`
   # $1: a json file specifying throughput test cases
@@ -204,7 +226,7 @@ run_throughput_tests() {
             throughput_command: $command,
             gpu_type: $gpu
     }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
     eval "$throughput_command"
@@ -236,7 +258,6 @@ run_serving_tests() {
       continue
     fi
 
-
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.server_parameters')
     client_params=$(echo "$params" | jq -r '.client_parameters')
@@ -269,6 +290,7 @@ run_serving_tests() {
     echo "Running test case $test_name"
     echo "Server command: $server_command"
     eval "$server_command" &
+    server_pid=$!
 
     # wait until the server is alive
     wait_for_server
@@ -313,11 +335,12 @@ run_serving_tests() {
             client_command: $client,
             gpu_type: $gpu
     }')
-      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
 
     done
 
     # clean up
+    kill -9 $server_pid
    kill_gpu_processes
  done
}
@@ -329,6 +352,7 @@ main() {
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
+  (which lsof) || (apt-get update && apt-get install -y lsof)
 
   # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
@@ -337,7 +361,7 @@ main() {
   # prepare for benchmarking
   cd benchmarks || exit 1
-  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+  ensure_sharegpt_downloaded
   declare -g RESULTS_FOLDER=results/
   mkdir -p $RESULTS_FOLDER
   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
@@ -347,7 +371,6 @@ main() {
   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
   # postprocess benchmarking results
-
   pip install tabulate pandas
   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

View File

@@ -0,0 +1,216 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args, replacing '_' with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -0,0 +1,214 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args, replacing '_' with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -0,0 +1,221 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string into command-line args, replacing '_' with '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@@ -0,0 +1,76 @@
import datetime
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"completed": "Successful req.",
"request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)",
"input_throughput": "Input Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
"engine": "Engine",
}
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
serving_md_table_with_headers = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split('\n')
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
# document benchmarking results in markdown
with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
# document results with header.
# for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers)
f.write('\n')
# document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient='records')
f.write(json.dumps(results))
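
Given the prefix construction above, each run emits a timestamped markdown/JSON pair named after the serving engine; for example (timestamp illustrative):

# illustrative output names produced by one lmdeploy run
#   results/2024-08-23_05-00-00_lmdeploy_nightly_results.md
#   results/2024-08-23_05-00-00_lmdeploy_nightly_results.json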

View File

@@ -2,7 +2,7 @@
     {
         "test_name": "latency_llama8B_tp1",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
             "num_iters_warmup": 5,
@@ -12,7 +12,7 @@
     {
         "test_name": "latency_llama70B_tp4",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "load_format": "dummy",
             "num-iters-warmup": 5,
View File

@@ -0,0 +1,116 @@
[
{
"test_name": "llama8B_tp1",
"qps_list": [4],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tp": 1,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "llama70B_tp4",
"qps_list": [2],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "mixtral8x7B_tp2",
"qps_list": [2],
"common_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tp": 2,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
}
]

View File

@@ -3,7 +3,7 @@
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
@@ -11,7 +11,7 @@
             "load_format": "dummy"
         },
         "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -22,7 +22,7 @@
         "test_name": "serving_llama70B_tp4_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "swap_space": 16,
             "disable_log_stats": "",
@@ -30,7 +30,7 @@
             "load_format": "dummy"
         },
         "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -55,5 +55,26 @@
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
         }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
+        "qps_list": [2],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "disable_log_requests": "",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1,
+            "use_v2_block_manager": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
     }
 ]

View File

@@ -2,7 +2,7 @@
     {
         "test_name": "throughput_llama8B_tp1",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -13,7 +13,7 @@
     {
         "test_name": "throughput_llama70B_tp4",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "load_format": "dummy",
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",

View File

@@ -1,21 +1,32 @@
 steps:
-  - block: "Build wheels"
-
-  - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
+  - label: "Build wheel - CUDA 12.1"
     agents:
       queue: cpu_queue
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      # rename the files to change linux -> manylinux1
+      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
       - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-    matrix:
-      setup:
-        cuda_version:
-          - "11.8.0"
-          - "12.1.0"
-        python_version:
-          - "3.8"
-          - "3.9"
-          - "3.10"
-          - "3.11"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - block: "Build CUDA 11.8 wheel"
+    key: block-build-cu118-wheel
+
+  - label: "Build wheel - CUDA 11.8"
+    depends_on: block-build-cu118-wheel
+    agents:
+      queue: cpu_queue
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      # rename the files to change linux -> manylinux1
+      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+    env:
+      DOCKER_BUILDKIT: "1"

View File

@@ -2,6 +2,15 @@
 set -ex
 
 # Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
 echo "--- ROCm info"
 rocminfo
@@ -45,15 +54,10 @@ while true; do
   fi
 done
 
-echo "--- Building container"
-sha=$(git rev-parse --short HEAD)
-image_name=rocm_${sha}
-container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
-docker build \
-        -t ${image_name} \
-        -f Dockerfile.rocm \
-        --progress plain \
-        .
+echo "--- Pulling container"
+image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull ${image_name}
 
 remove_docker_container() {
    docker rm -f ${container_name} || docker image rm -f ${image_name} || true
@@ -62,11 +66,18 @@ trap remove_docker_container EXIT
 
 echo "--- Running container"
 
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p ${HF_CACHE}
+HF_MOUNT="/root/.cache/huggingface"
+
 docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \
+        --shm-size=16gb \
         --rm \
         -e HF_TOKEN \
+        -v ${HF_CACHE}:${HF_MOUNT} \
+        -e HF_HOME=${HF_MOUNT} \
         --name ${container_name} \
         ${image_name} \
         /bin/bash -c "${@}"
View File

@@ -3,26 +3,38 @@
 set -ex
 
 # Try building the docker image
-docker build -t cpu-test -f Dockerfile.cpu .
-docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
 remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
-# Run the image
+# Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
-  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+  --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
-  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
+  --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
 
 # offline inference
-docker exec cpu-test bash -c "python3 examples/offline_inference.py"
 docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
-docker exec cpu-test bash -c "cd tests;
-  pip install pytest Pillow protobuf
-  cd ../
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
+docker exec cpu-test bash -c "
+  pip install pytest matplotlib einops transformers_stream_generator
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+
+# online inference
+docker exec cpu-test bash -c "
+  export VLLM_CPU_KVCACHE_SPACE=10
+  export VLLM_CPU_OMP_THREADS_BIND=48-92
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+  python3 benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --dataset-name random \
+    --model facebook/opt-125m \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer facebook/opt-125m"
.buildkite/run-multi-node-test.sh (new executable file, 105 lines)
View File

@@ -0,0 +1,105 @@
#!/bin/bash
set -euox pipefail
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi
WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4
shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
exit 1
fi
echo "List of commands"
for command in "${COMMANDS[@]}"; do
echo $command
done
start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}
start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
# start the container in detached mode
# things to note:
# 1. --shm-size=10.24gb is required. don't use --ipc=host
# 2. pass HF_TOKEN to the container
# 3. map the huggingface cache directory to the container
# 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
# organize containers into a ray cluster
if [ $node -eq 0 ]; then
# start the ray head node
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep 10
else
# start the ray worker nodes, and connect them to the head node
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi
done
# wait for the cluster to be ready
sleep 10
# print the cluster status
docker exec node0 /bin/bash -c "ray status"
}
run_nodes() {
# important: iterate in reverse order to start the head node last
# we start the worker nodes first, in detached mode, and then start the head node
# in the foreground, so that the output of the head node is visible in the buildkite logs
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -ne 0 ]; then
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
docker stop node$node
done
docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes
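
Tying the arguments together, a two-node, four-GPU-per-node invocation matching the usage string above would look roughly like this; the image name and both commands are placeholders:

# sketch: 2 nodes x 4 GPUs each; the first command runs on the head node (node0),
# the second on the worker, which here simply stays up so it can serve the ray cluster
.buildkite/run-multi-node-test.sh /vllm-workspace/tests 2 4 vllm-ci:latest \
  "pytest -v -s distributed/test_multi_node.py" \
  "tail -f /dev/null"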

View File

@@ -0,0 +1,16 @@
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
python3 /workspace/vllm/examples/offline_inference_tpu.py

View File

@@ -5,239 +5,397 @@
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 # https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file. # to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware platforms to also run the test on. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2 and 4.
# num_nodes(int): whether to simulate a multi-node setup by launching multiple containers on one host;
# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
# working_dir(str): specify the directory in which the command should execute. defaults to /vllm-workspace/tests
# source_file_dependencies(list): the list of path prefixes that opt the test in; if empty, the test will always run.
# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
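# An example step combining these fields (hypothetical, for illustration only; not an actual test in this pipeline):
#
# - label: Example Test # 5min
#   fast_check: true
#   num_gpus: 1
#   working_dir: "/vllm-workspace/tests"
#   source_file_dependencies:
#   - vllm/
#   - tests/example
#   commands:
#   - pytest -v -s example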
steps: steps:
- label: Regression Test ##### fast check tests #####
mirror_hardwares: [amd]
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test - label: Documentation Build # 2min
#mirror_hardwares: [amd] working_dir: "/vllm-workspace/test_docs/docs"
command: pytest -v -s async_engine fast_check: true
no_gpu: True
- label: Basic Correctness Test
mirror_hardwares: [amd]
commands: commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - pip install -r requirements-docs.txt
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py - SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 15min
fast_check: true
source_file_dependencies:
- vllm/
- tests/async_engine
- tests/test_inputs
- tests/multimodal
- tests/test_utils
- tests/worker
commands:
- pytest -v -s async_engine # Async Engine
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
- label: Basic Correctness Test # 30min
#mirror_hardwares: [amd]
fast_check: true
source_file_dependencies:
- vllm/
- tests/basic_correctness
commands:
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test - label: Core Test # 10min
mirror_hardwares: [amd] mirror_hardwares: [amd]
fast_check: true
source_file_dependencies:
- vllm/core
- vllm/distributed
- tests/core
commands: commands:
- pytest -v -s core - pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py
- label: Distributed Comm Ops Test - label: Entrypoints Test # 20min
working_dir: "/vllm-workspace/tests"
fast_check: true
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests" source_file_dependencies:
num_gpus: 2 - vllm/
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- bash ../.buildkite/download-images.sh
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Distributed Tests (4 GPUs)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
- label: Pipeline Parallelism Test
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
- TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
- label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test
mirror_hardwares: [amd]
commands: commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- pytest -v -s entrypoints/llm - pytest -v -s entrypoints/llm
- pytest -v -s entrypoints/openai - pytest -v -s entrypoints/openai
- label: Examples Test - label: Distributed Tests (4 GPUs) # 10min
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/tests"
mirror_hardwares: [amd] num_gpus: 4
fast_check: true
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
commands: commands:
# install aws cli for llava_example.py - pytest -v -s distributed/test_pynccl.py
# install tensorizer for tensorize_vllm_model.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
- pip install awscli tensorizer
- label: Metrics, Tracing Test # 10min
num_gpus: 2
fast_check: true
source_file_dependencies:
- vllm/
- tests/metrics
- tests/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing
##### fast check tests #####
##### 1 GPU test #####
- label: Regression Test # 5min
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/test_regression
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/engine
- tests/tokenization
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: Examples Test # 12min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
commands:
- pip install awscli tensorizer # for llava example and tensorizer test
- python3 offline_inference.py - python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_chat.py
- python3 offline_inference_with_prefix.py - python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py - python3 llm_engine_example.py
- python3 llava_example.py - python3 offline_inference_vision_language.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- label: Inputs Test - label: Models Test # 1hr10min
source_file_dependencies:
- vllm/
- tests/models
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
- label: torch compile integration test
source_file_dependencies:
- vllm/
commands:
- pytest -v -s ./compile/test_full_graph.py
- label: Vision Language Models Test # 42min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands: commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- label: Kernels Test %N
#mirror_hardwares: [amd]
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Models Test
#mirror_hardwares: [amd]
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s models -m \"not vlm\"
- label: Vision Language Models Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models -m vlm - pytest -v -s models -m vlm
- label: Prefix Caching Test - label: Prefix Caching Test # 7min
mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands: commands:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Samplers Test - label: Samplers Test # 18min
#mirror_hardwares: [amd] source_file_dependencies:
command: pytest -v -s samplers - vllm/model_executor/layers
- vllm/sampling_metadata.py
- tests/samplers
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test - label: LogitsProcessor Test # 5min
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py command: pytest -v -s test_logits_processor.py
- label: Utils Test - label: Speculative decoding tests # 22min
command: pytest -v -s test_utils.py source_file_dependencies:
- vllm/spec_decode
- label: Worker Test - tests/spec_decode
mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
#mirror_hardwares: [amd]
commands: commands:
# See https://github.com/vllm-project/vllm/issues/5152 # See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS - export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode - pytest -v -s spec_decode
- label: LoRA Test %N - label: LoRA Test %N # 30min each
#mirror_hardwares: [amd] source_file_dependencies:
- vllm/lora
- csrc/punica
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4 parallelism: 4
- label: LoRA Long Context (Distributed) - label: Kernels Test %N # 30min each
source_file_dependencies:
- csrc/
- vllm/attention
- tests/kernels
commands:
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Tensorizer Test # 11min
soft_fail: true
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
commands:
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
- label: Benchmarks # 9min
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh
- label: Quantization Test # 15min
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
command: pytest -v -s quantization
- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
##### 1 GPU test #####
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- label: Distributed Tests (2 GPUs) # 28min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
- pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Multi-step Tests (4 GPUs) # 21min
working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies:
- vllm/model_executor/layers/sampler.py
- vllm/sequence.py
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/multi_step_worker.py
- vllm/worker/model_runner_base.py
- vllm/worker/model_runner.py
- vllm/worker/multi_step_model_runner.py
- vllm/engine
- tests/multi_step
commands:
- pytest -v -s multi_step/test_correctness.py
- label: Pipeline Parallelism Test # 23min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA Long Context (Distributed) # 11min
# This test runs llama 13B, so it is required to run on 4 GPUs. # This test runs llama 13B, so it is required to run on 4 GPUs.
num_gpus: 4
source_file_dependencies:
- vllm/lora
- csrc/punica
- tests/lora/test_long_context
commands: commands:
# FIXIT: find out which code initialize cuda before running the test # FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it # before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py - pytest -v -s -x lora/test_long_context.py
- label: Tensorizer Test - label: Weight Loading Multiple GPU Test
#mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests"
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader num_gpus: 2
source_file_dependencies:
- label: Metrics Test - vllm/
mirror_hardwares: [amd] - tests/weight_loading
command: pytest -v -s metrics
- label: Quantization Test
#mirror_hardwares: [amd]
command: pytest -v -s quantization
- label: Tracing Test
commands: commands:
- "pip install \ - bash weight_loading/run_model_weight_loading_test.sh
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh
- label: LM Eval Small Models ##### multi gpus test #####
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" ##### A100 test #####
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: LM Eval Large Models - label: Distributed Tests (A100) # optional
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
- label: Documentation Build
working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
- label: Distributed Tests (A100)
gpu: a100 gpu: a100
num_gpus: 4 num_gpus: 4
source_file_dependencies:
- vllm/
commands: commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy # NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details # see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py - pytest -v -s distributed/test_custom_all_reduce.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4


@@ -1 +1,4 @@
vllm/*.so vllm/*.so
/.venv
/build
dist

.github/FUNDING.yml

@@ -0,0 +1,2 @@
github: [vllm-project]
open_collective: [vllm]


@@ -20,3 +20,10 @@ body:
attributes: attributes:
value: > value: >
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -38,3 +38,10 @@ body:
attributes: attributes:
value: > value: >
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -36,3 +36,10 @@ body:
attributes: attributes:
value: > value: >
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -20,9 +20,14 @@ body:
``` ```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: | value: |
<details>
<summary>The output of `python collect_env.py`</summary>
```text ```text
The output of `python collect_env.py` Your output of `python collect_env.py` here
``` ```
</details>
validations: validations:
required: true required: true
- type: textarea - type: textarea
@@ -84,3 +89,10 @@ body:
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -29,3 +29,10 @@ body:
attributes: attributes:
value: > value: >
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -31,3 +31,10 @@ body:
attributes: attributes:
value: > value: >
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -50,3 +50,10 @@ body:
attributes: attributes:
value: > value: >
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -47,3 +47,10 @@ body:
attributes: attributes:
value: > value: >
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -19,3 +19,10 @@ body:
attributes: attributes:
value: > value: >
Thanks for contributing 🎉! Thanks for contributing 🎉!
- type: checkboxes
id: askllm
attributes:
label: Before submitting a new issue...
options:
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
required: true


@@ -0,0 +1,21 @@
name: Add label on auto-merge enabled
on:
pull_request_target:
types:
- auto_merge_enabled
jobs:
add-label-on-auto-merge:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


@@ -0,0 +1,23 @@
name: Add Ready Label on Ready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


@@ -30,12 +30,11 @@ jobs:
run: | run: |
EXCLUDES=( EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu' 'csrc/moe/topk_softmax_kernels.cu'
'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu' 'csrc/quantization/gguf/ggml-common.h'
'csrc/punica/bgmv/bgmv_config.h' 'csrc/quantization/gguf/dequantize.cuh'
'csrc/punica/bgmv/bgmv_impl.cuh' 'csrc/quantization/gguf/vecdotq.cuh'
'csrc/punica/bgmv/vec_dtypes.cuh' 'csrc/quantization/gguf/mmq.cuh'
'csrc/punica/punica_ops.cu' 'csrc/quantization/gguf/mmvq.cuh'
'csrc/punica/type_convert.h'
) )
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \


@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
@@ -25,27 +25,23 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install mypy==1.9.0 pip install mypy==1.11.1
pip install types-setuptools pip install types-setuptools
pip install types-PyYAML pip install types-PyYAML
pip install types-requests pip install types-requests
pip install types-setuptools pip install types-setuptools
- name: Mypy - name: Mypy
run: | run: |
mypy vllm/attention --config-file pyproject.toml mypy
mypy vllm/core --config-file pyproject.toml mypy tests --follow-imports skip
mypy vllm/distributed --config-file pyproject.toml mypy vllm/attention --follow-imports skip
mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/core --follow-imports skip
mypy vllm/executor --config-file pyproject.toml mypy vllm/distributed --follow-imports skip
mypy vllm/multimodal --config-file pyproject.toml mypy vllm/engine --follow-imports skip
mypy vllm/usage --config-file pyproject.toml mypy vllm/executor --follow-imports skip
mypy vllm/*.py --config-file pyproject.toml mypy vllm/lora --follow-imports skip
mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/model_executor --follow-imports skip
mypy vllm/engine --config-file pyproject.toml mypy vllm/prompt_adapter --follow-imports skip
mypy vllm/worker --config-file pyproject.toml mypy vllm/spec_decode --follow-imports skip
mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/worker --follow-imports skip
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy tests --config-file pyproject.toml


@@ -48,8 +48,8 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: ['ubuntu-20.04'] os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11'] python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt. pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1'] cuda-version: ['11.8', '12.1']
steps: steps:

.github/workflows/reminder_comment.yml

@@ -0,0 +1,21 @@
name: PR Reminder Comment Bot
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@v6
with:
script: |
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


@@ -0,0 +1,23 @@
name: Remove ready Label on notready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/notready')
steps:
- name: Remove ready label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
name: 'ready'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}


@@ -13,8 +13,6 @@ $python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM # Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1 export MAX_JOBS=1
# Make sure punica is built for the release (for LoRA)
export VLLM_INSTALL_PUNICA_KERNELS=1
# Make sure release wheels are built for the following architectures # Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
# Build # Build


@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}

.gitignore

@@ -1,3 +1,6 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
@@ -84,6 +87,9 @@ target/
profile_default/ profile_default/
ipython_config.py ipython_config.py
# generated files
**/generated/**
# pyenv # pyenv
# For a library or package, you might want to ignore these files since the code is # For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in: # intended to run in multiple environments; otherwise, check them in:
@@ -186,4 +192,4 @@ _build/
hip_compat.h hip_compat.h
# Benchmark dataset # Benchmark dataset
*.json benchmarks/*.json


@@ -10,6 +10,7 @@ build:
sphinx: sphinx:
configuration: docs/source/conf.py configuration: docs/source/conf.py
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF # If using Sphinx, optionally build your docs in additional formats such as PDF
formats: formats:


@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.21) cmake_minimum_required(VERSION 3.26)
project(vllm_extensions LANGUAGES CXX) project(vllm_extensions LANGUAGES CXX)
@@ -10,11 +10,14 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
# #
# Supported python versions. These versions will be searched in order, the # Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py. # first match will be selected. These should be kept in sync with setup.py.
# #
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures. # Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
@@ -32,8 +35,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm # versions are derived from Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0") set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
# #
# Try to find python package with an executable that exactly matches # Try to find python package with an executable that exactly matches
@@ -66,6 +69,39 @@ endif()
# #
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.")
# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)
set(VLLM_EXT_SRC
"csrc/core/torch_bindings.cpp")
define_gpu_extension_target(
_core_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI)
add_dependencies(default _core_C)
# #
# Forward the non-CUDA device extensions to external CMake scripts. # Forward the non-CUDA device extensions to external CMake scripts.
# #
@@ -74,7 +110,7 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
if (VLLM_TARGET_DEVICE STREQUAL "cpu") if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else() else()
message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}") return()
endif() endif()
return() return()
endif() endif()
@@ -101,7 +137,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X # ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} " message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.") "expected for ROCm build, saw ${Torch_VERSION} instead.")
endif() endif()
else() else()
@@ -132,7 +168,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
# #
# Define extension targets # Define other extension targets
# #
# #
@@ -151,16 +187,18 @@ set(VLLM_EXT_SRC
"csrc/quantization/fp8/common.cu" "csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu" "csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu" "csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent) include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY=ON) SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_Declare( FetchContent_Declare(
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.0 # CUTLASS 3.5.1
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE
) )
FetchContent_MakeAvailable(cutlass) FetchContent_MakeAvailable(cutlass)
@@ -169,8 +207,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu" "csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
@@ -189,6 +230,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"-gencode arch=compute_90a,code=sm_90a") "-gencode arch=compute_90a,code=sm_90a")
endif() endif()
#
# Machete kernels
# The machete kernels only work on hopper and require CUDA 12.0 or later.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
#
# For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
)
if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
" Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
message(STATUS "Machete generation completed successfully.")
endif()
# Add machete generated sources
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
set_source_files_properties(
${MACHETE_GEN_SOURCES}
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
endif()
# Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
# raise an error telling the user that this was built with an incompatible
# CUDA version)
list(APPEND VLLM_EXT_SRC
csrc/quantization/machete/machete_pytorch.cu)
endif() endif()
define_gpu_extension_target( define_gpu_extension_target(
@@ -198,7 +284,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC} SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS} COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES} ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
@@ -220,76 +306,7 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
#
# _punica_C extension
#
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/punica_ops.cu"
"csrc/punica/torch_bindings.cpp")
#
# Copy GPU compilation flags+update for punica
#
set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
"-D__CUDA_NO_HALF_OPERATORS__"
"-D__CUDA_NO_HALF_CONVERSIONS__"
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
"-D__CUDA_NO_HALF2_OPERATORS__")
#
# Filter out CUDA architectures < 8.0 for punica.
#
if (${VLLM_GPU_LANG} STREQUAL "CUDA")
set(VLLM_PUNICA_GPU_ARCHES)
foreach(ARCH ${VLLM_GPU_ARCHES})
string_to_ver(CODE_VER ${ARCH})
if (CODE_VER GREATER_EQUAL 8.0)
list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
define_gpu_extension_target(
_punica_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_PUNICA_EXT_SRC}
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
else()
message(WARNING "Unable to create _punica_C target because none of the "
"requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
endif()
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.") message(STATUS "Enabling C extension.")
@@ -298,12 +315,4 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C) add_dependencies(default _moe_C)
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
# there are supported target arches.
if (VLLM_PUNICA_GPU_ARCHES AND
(ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
message(STATUS "Enabling punica extension.")
add_dependencies(default _punica_C)
endif()
endif() endif()


@@ -8,26 +8,24 @@
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
# prepare basic build environment # prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3 ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common \ && apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \ && add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& python3 --version \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& python3 -m pip --version && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
RUN apt-get update -y \ && python3 --version && python3 -m pip --version
&& apt-get install -y python3-pip git curl sudo
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -39,6 +37,7 @@ WORKDIR /workspace
# install build and runtime dependencies # install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt python3 -m pip install -r requirements-cuda.txt
@@ -58,23 +57,19 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### WHEEL BUILD IMAGE #################### #################### WHEEL BUILD IMAGE ####################
FROM base AS build FROM base AS build
ARG PYTHON_VERSION=3
# install build dependencies # install build dependencies
COPY requirements-build.txt requirements-build.txt COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt python3 -m pip install -r requirements-build.txt
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache
# files and directories related to build wheels # files and directories related to build wheels
COPY csrc csrc COPY csrc csrc
COPY setup.py setup.py COPY setup.py setup.py
COPY cmake cmake COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml COPY pyproject.toml pyproject.toml
COPY vllm vllm COPY vllm vllm
@@ -85,10 +80,13 @@ ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc # number of threads used by nvcc
ARG nvcc_threads=8 ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1 ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
# if USE_SCCACHE is set, use sccache to speed up compilation # if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" = "1" ]; then \ if [ "$USE_SCCACHE" = "1" ]; then \
@@ -97,10 +95,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
&& tar -xzf sccache.tar.gz \ && tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=vllm-build-sccache \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=us-west-2 \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \ && sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \ && sccache --show-stats; \
fi fi
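# For reference, a sketch of how the sccache-related build arguments above might be passed when
# building the wheel stage directly (the bucket and region shown are simply the defaults declared
# above, the image tag is a placeholder, and valid AWS credentials for the bucket are assumed):
#
#   DOCKER_BUILDKIT=1 docker build . --target build \
#     --build-arg USE_SCCACHE=1 \
#     --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
#     --build-arg SCCACHE_REGION_NAME=us-west-2 \
#     -t vllm-wheel-build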
@@ -108,7 +108,7 @@ ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" != "1" ]; then \ if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist; \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi fi
# check the size of the wheel, we cannot upload wheels larger than 100MB # check the size of the wheel, we cannot upload wheels larger than 100MB
@@ -145,12 +145,28 @@ RUN pip --verbose wheel -r requirements-mamba.txt \
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y \ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
&& apt-get install -y python3-pip git vim echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -166,6 +182,10 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \ RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################


@@ -2,36 +2,49 @@
FROM ubuntu:22.04 AS cpu-test-1 FROM ubuntu:22.04 AS cpu-test-1
RUN apt-get update -y \ RUN --mount=type=cache,target=/var/cache/apt \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN pip install intel-openmp RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD" ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
RUN pip install --upgrade pip \ ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
FROM cpu-test-1 AS build FROM cpu-test-1 AS build
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
COPY ./ ./
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl
WORKDIR /workspace/ WORKDIR /workspace/

View File

@@ -1,5 +1,5 @@
# default base image # default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04" ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
FROM $BASE_IMAGE FROM $BASE_IMAGE

View File

@@ -13,12 +13,15 @@ COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/
COPY vllm/ /workspace/vllm/vllm
COPY csrc/core /workspace/vllm/csrc/core
COPY cmake/utils.cmake /workspace/vllm/cmake/
COPY CMakeLists.txt /workspace/vllm/
COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks

View File

@@ -1,26 +1,24 @@
# Default ROCm 6.1 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
# Tested and supported base rocm/pytorch images
ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
# Whether to build CK-based flash-attention # Whether to install CK-based flash-attention
# If 0, will not build flash attention # If 0, will not install flash-attention
# This is useful for gfx target where flash-attention is not supported
# (i.e. those that do not appear in `FA_GFX_ARCHS`)
# Triton FA is used by default on ROCm now so this is unnecessary.
ARG BUILD_FA="1" ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942" ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="ae7928c" ARG FA_BRANCH="23a2b1c2"
# Whether to build triton on rocm # Whether to build triton on rocm
ARG BUILD_TRITON="1" ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="0ef1848" ARG TRITON_BRANCH="e0fc12c"
### Base image build stage
FROM $BASE_IMAGE AS base
@@ -48,29 +46,17 @@ RUN apt-get update && apt-get install -y \
ARG APP_MOUNT=/vllm-workspace
WORKDIR ${APP_MOUNT}
RUN pip install --upgrade pip RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache # Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components # TODO: implement sccache support across components
RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)" RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.4.0 on ROCm # Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-5.7"*) \
pip uninstall -y torch torchaudio torchvision \
&& pip install --no-cache-dir --pre \
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
torchvision==0.19.0.dev20240612 \
--index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
*"rocm-6.0"*) \
pip uninstall -y torch torchaudio torchvision \
&& pip install --no-cache-dir --pre \
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
torchvision==0.19.0.dev20240612 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
*"rocm-6.1"*) \ *"rocm-6.1"*) \
pip uninstall -y torch torchaudio torchvision \ python3 -m pip uninstall -y torch torchvision \
&& pip install --no-cache-dir --pre \ && python3 -m pip install --no-cache-dir --pre \
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ torch==2.5.0.dev20240726 \
torchvision==0.19.0.dev20240612 \ torchvision==0.20.0.dev20240726 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
*) ;; esac *) ;; esac
@@ -87,29 +73,31 @@ ENV CCACHE_DIR=/root/.cache/ccache
FROM base AS build_amdsmi FROM base AS build_amdsmi
# Build amdsmi wheel always # Build amdsmi wheel always
RUN cd /opt/rocm/share/amd_smi \ RUN cd /opt/rocm/share/amd_smi \
&& pip wheel . --wheel-dir=/install && python3 -m pip wheel . --wheel-dir=/install
### Flash-Attention wheel build stage ### Flash-Attention wheel build stage
FROM base AS build_fa FROM base AS build_fa
ARG BUILD_FA ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \ RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \ if [ "$BUILD_FA" = "1" ]; then \
mkdir -p libs \ if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
&& cd libs \ # If a suitable wheel exists, we download it instead of building FA
&& git clone https://github.com/ROCm/flash-attention.git \ mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
&& cd flash-attention \ else \
&& git checkout "${FA_BRANCH}" \ mkdir -p libs \
&& git submodule update --init \ && cd libs \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ && git clone https://github.com/ROCm/flash-attention.git \
*"rocm-5.7"*) \ && cd flash-attention \
export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \ && git checkout "${FA_BRANCH}" \
&& patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \ && git submodule update --init \
*) ;; esac \ && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ fi; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
@@ -139,19 +127,11 @@ FROM base AS final
# Import the vLLM development directory from the build context # Import the vLLM development directory from the build context
COPY . . COPY . .
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually remove it so that later steps of numpy upgrade can continue
RUN case "$(which python3)" in \
*"/opt/conda/envs/py_3.9"*) \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
*) ;; esac
# Package upgrades for useful functionality or to avoid dependency issues # Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade numba scipy huggingface-hub[cli] python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
@@ -159,14 +139,11 @@ ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \ RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \ python3 -m pip install -Ur requirements-rocm.txt \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.0"*) \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
*"rocm-6.1"*) \ *"rocm-6.1"*) \
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
&& cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
# Prevent interference if torch bundles its own HIP runtime # Prevent interference if torch bundles its own HIP runtime
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \ && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
*) ;; esac \ *) ;; esac \
@@ -178,7 +155,7 @@ RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
mkdir -p libs \ mkdir -p libs \
&& cp /install/*.whl libs \ && cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs # Preemptively uninstall to avoid same-version no-installs
&& pip uninstall -y amdsmi; && python3 -m pip uninstall -y amdsmi;
# Copy triton wheel(s) into final image if they were built # Copy triton wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
@@ -186,7 +163,7 @@ RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
&& if ls /install/*.whl; then \ && if ls /install/*.whl; then \
cp /install/*.whl libs \ cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs # Preemptively uninstall to avoid same-version no-installs
&& pip uninstall -y triton; fi && python3 -m pip uninstall -y triton; fi
# Copy flash-attn wheel(s) into final image if they were built # Copy flash-attn wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
@@ -194,11 +171,11 @@ RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
&& if ls /install/*.whl; then \ && if ls /install/*.whl; then \
cp /install/*.whl libs \ cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs # Preemptively uninstall to avoid same-version no-installs
&& pip uninstall -y flash-attn; fi && python3 -m pip uninstall -y flash-attn; fi
# Install wheels that were built to the final image # Install wheels that were built to the final image
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
if ls libs/*.whl; then \ if ls libs/*.whl; then \
pip install libs/*.whl; fi python3 -m pip install libs/*.whl; fi
CMD ["/bin/bash"] CMD ["/bin/bash"]

View File

@@ -1,19 +1,17 @@
ARG NIGHTLY_DATE="20240601" ARG NIGHTLY_DATE="20240808"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
WORKDIR /workspace
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install the TPU and Pallas dependencies. # Install the TPU and Pallas dependencies.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Build vLLM. # Build vLLM.
RUN cd /workspace/vllm && python setup.py develop COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
RUN cd /workspace/vllm && python3 setup.py develop
CMD ["/bin/bash"] CMD ["/bin/bash"]

View File

@@ -1,4 +1,4 @@
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \

View File

@@ -1,4 +1,5 @@
include LICENSE
include requirements-adag.txt
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt

View File

@@ -10,33 +10,29 @@ Easy, fast, and cheap LLM serving for everyone
</h3> </h3>
<p align="center"> <p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
</p> </p>
--- ---
**Ray Summit CPF is Open (June 4th to June 20th)!**
There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
This will be a great chance for everyone in the community to get together and learn.
Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
Join us to hear the vLLM's recent update about performance.
Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
---
*Latest News* 🔥
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM. - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/12] Added ROCm 5.7 support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
@@ -49,30 +45,35 @@ vLLM is fast with:
- Efficient management of attention key and value memory with **PagedAttention**
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
- Optimized CUDA kernels - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding
- Chunked prefill
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism support for distributed inference - Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs - Streaming outputs
- OpenAI-compatible API server - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- (Experimental) Prefix caching support - Prefix caching support
- (Experimental) Multi-lora support - Multi-lora support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral)
- Embedding Models (e.g. E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
## Getting Started
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
```bash
pip install vllm
@@ -103,12 +104,14 @@ vLLM is a community project. Our compute resources for development and testing a
- Databricks
- DeepInfra
- Dropbox
- Google Cloud
- Lambda Lab
- NVIDIA
- Replicate
- Roblox
- RunPod
- Sequoia Capital
- Skywork AI
- Trainy
- UC Berkeley
- UC San Diego

View File

@@ -225,8 +225,8 @@ async def async_request_openai_completions(
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith( assert api_url.endswith(
"completions" ("completions", "profile")
), "OpenAI Completions API URL must end with 'completions'." ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search assert not request_func_input.use_beam_search
@@ -276,8 +276,9 @@ async def async_request_openai_completions(
output.ttft = ttft
# Decoding phase
output.itl.append(timestamp -
most_recent_timestamp)
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"]
@@ -390,17 +391,17 @@ def remove_prefix(text: str, prefix: str) -> str:
return text return text
def get_model(pretrained_model_name_or_path: str): def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download from modelscope import snapshot_download
else:
from huggingface_hub import snapshot_download
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path
return model_path
return pretrained_model_name_or_path
def get_tokenizer( def get_tokenizer(

View File

@@ -11,7 +11,7 @@ from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptStrictInputs from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@@ -61,7 +61,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000, dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size, size=(args.batch_size,
args.input_len)) args.input_len))
dummy_inputs: List[PromptStrictInputs] = [{ dummy_inputs: List[PromptInputs] = [{
"prompt_token_ids": batch "prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()] } for batch in dummy_prompt_token_ids.tolist()]

View File

@@ -1,8 +1,45 @@
"""
Benchmark the efficiency of prefix caching.
This script allows you to benchmark the performance of
a model with and without prefix caching using either fixed prompts
or prompts sampled from the ShareGPT dataset.
Fixed example usage:
python benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100
ShareGPT example usage:
# This command samples 20 prompts with input lengths
# between 128 and 256 tokens from the ShareGPT dataset,
# then replicates each prompt 5 times.
python benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
"""
import json
import random
import time
from typing import List, Optional, Tuple
from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
@@ -15,7 +52,83 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
print(f"cost time {end_time - start_time}") print(f"cost time {end_time - start_time}")
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
min_len, max_len = input_length_range
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if min_len <= prompt_len <= max_len:
filtered_dataset.append((prompt, prompt_len, output_len))
return filtered_dataset
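The loop above keeps a prompt only when its tokenized length falls inside --input-length-range, prunes very short sequences, and stops once --num-prompts prompts have been collected. A toy sketch of the same selection on made-up prompt lengths (all names and numbers here are illustrative, not from the script):

```python
# Toy illustration of the input-length filter used by sample_requests above.
from typing import List, Tuple

def filter_by_length(prompt_lens: List[int], num_requests: int,
                     input_length_range: Tuple[int, int]) -> List[int]:
    min_len, max_len = input_length_range
    kept: List[int] = []
    for plen in prompt_lens:
        if len(kept) == num_requests:
            break
        if plen < 4:            # prune too-short sequences, as in the script
            continue
        if min_len <= plen <= max_len:
            kept.append(plen)
    return kept

print(filter_by_length([50, 130, 300, 200, 128, 256, 90], 3, (128, 256)))
# -> [130, 200, 128]
```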
def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
repeat_count: int,
sort: bool = False) -> List[str]:
repeated_requests = requests * repeat_count
if sort:
repeated_requests.sort(key=lambda x: x[1])
else:
random.shuffle(repeated_requests)
return [req[0] for req in repeated_requests]
def main(args): def main(args):
tokenizer = get_tokenizer(args.model, trust_remote_code=True)
input_length_range = tuple(map(int, args.input_length_range.split(':')))
if args.dataset_path is not None:
print(f"Start to sample {args.num_prompts} prompts "
f"from {args.dataset_path}")
filtered_datasets = sample_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
input_length_range=input_length_range,
fixed_output_len=args.output_len,
)
else:
prompt_len = len(tokenizer(PROMPT).input_ids)
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
] * args.num_prompts
llm = LLM(model=args.model,
tokenizer_mode='auto',
trust_remote_code=True,
@@ -24,10 +137,13 @@ def main(args):
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100
prompts = [PROMPT] * num_prompts
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("Testing filtered datasets")
prompts = repeat_and_sort_requests(filtered_datasets,
repeat_count=args.repeat_count,
sort=args.sort)
print("------warm up------") print("------warm up------")
test_prefix( test_prefix(
llm=llm, llm=llm,
@@ -45,11 +161,15 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Benchmark the performance with or without automatic ' description=
'prefix caching.') 'Benchmark the performance with or without automatic prefix caching.')
parser.add_argument('--model', parser.add_argument('--model',
type=str, type=str,
default='baichuan-inc/Baichuan2-13B-Chat') default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10) parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching', parser.add_argument('--enable-prefix-caching',
@@ -58,5 +178,21 @@ if __name__ == "__main__":
parser.add_argument('--use-v2-block-manager', parser.add_argument('--use-v2-block-manager',
action='store_true', action='store_true',
help='Use BlockSpaceMangerV2') help='Use BlockSpaceMangerV2')
parser.add_argument('--num-prompts',
type=int,
default=1,
help="Number of the prompts sampled from dataset")
parser.add_argument('--repeat-count',
type=int,
default=100,
help='Number of times to repeat each prompt')
parser.add_argument('--sort',
action='store_true',
help='Sort prompts by input length')
parser.add_argument('--input-length-range',
type=str,
default='128:256',
help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").')
args = parser.parse_args()
main(args)

View File

@@ -2,8 +2,8 @@
On the server side, run one of the following commands: On the server side, run one of the following commands:
vLLM OpenAI API server vLLM OpenAI API server
python -m vllm.entrypoints.openai.api_server \ vllm serve <your_model> \
--model <your_model> --swap-space 16 \ --swap-space 16 \
--disable-log-requests --disable-log-requests
(TGI backend) (TGI backend)
@@ -60,12 +60,15 @@ class BenchmarkMetrics:
output_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
p99_ttft_ms: float
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
p99_tpot_ms: float
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
p99_itl_ms: float
@@ -77,7 +80,6 @@ def sample_sharegpt_requests(
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@@ -185,6 +187,31 @@ def sample_sonnet_requests(
return sampled_requests return sampled_requests
def sample_random_requests(
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
input_lens = np.random.randint(
int(input_len * range_ratio),
input_len + 1,
size=num_prompts,
)
output_lens = np.random.randint(
int(output_len * range_ratio),
output_len + 1,
size=num_prompts,
)
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = []
for i in range(num_prompts):
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])])
input_requests.append(
(prompt, int(input_lens[i]), int(output_lens[i])))
return input_requests
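Each synthetic request gets an input and output length drawn uniformly from [len * range_ratio, len], and the prompt itself is just decoded from arbitrary token ids of that length. A hedged sketch of the length sampling with example values (range_ratio and the prompt count below are illustrative, not the argparse defaults):

```python
# Rough sketch of the length sampling in sample_random_requests above;
# the concrete numbers are examples chosen for the illustration.
import numpy as np

input_len, output_len, num_prompts, range_ratio = 1024, 128, 4, 0.5
input_lens = np.random.randint(int(input_len * range_ratio), input_len + 1,
                               size=num_prompts)
output_lens = np.random.randint(int(output_len * range_ratio), output_len + 1,
                                size=num_prompts)
print(input_lens, output_lens)  # e.g. [734 988 512 601] [ 70 128  91  64]
```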
async def get_request( async def get_request(
input_requests: List[Tuple[str, int, int]], input_requests: List[Tuple[str, int, int]],
request_rate: float, request_rate: float,
@@ -196,6 +223,7 @@ async def get_request(
if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait.
continue
# Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate)
# The next request will be sent after the interval.
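Because the gaps are exponential with mean 1/request_rate, the arrivals form a Poisson process at roughly request_rate requests per second. A small sketch (the 2.0 rate is an example value, not a script default):

```python
# Sketch of the Poisson-arrival pacing used by get_request above.
import numpy as np

request_rate = 2.0                       # requests per second (example)
intervals = np.random.exponential(1.0 / request_rate, size=10_000)
print(intervals.mean())                  # ~0.5 s between requests on average
```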
@@ -219,7 +247,7 @@ def calculate_metrics(
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note: this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
@@ -249,12 +277,15 @@ def calculate_metrics(
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
std_ttft_ms=np.std(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
mean_itl_ms=np.mean(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
)
@@ -264,6 +295,7 @@ def calculate_metrics(
async def benchmark(
backend: str,
api_url: str,
base_url: str,
model_id: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]],
@@ -271,6 +303,7 @@ async def benchmark(
use_beam_search: bool,
request_rate: float,
disable_tqdm: bool,
profile: bool,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -295,6 +328,22 @@ async def benchmark(
f"are correctly specified. Error: {test_output.error}") f"are correctly specified. Error: {test_output.error}")
else: else:
print("Initial test run completed. Starting main benchmark run...") print("Initial test run completed. Starting main benchmark run...")
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
print(f"Traffic request rate: {request_rate}") print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests)) pbar = None if disable_tqdm else tqdm(total=len(input_requests))
@@ -318,6 +367,21 @@ async def benchmark(
pbar=pbar))) pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
if pbar is not None: if pbar is not None:
pbar.close() pbar.close()
@@ -371,12 +435,15 @@ async def benchmark(
"output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
"std_ttft_ms": metrics.std_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"std_tpot_ms": metrics.std_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
"mean_itl_ms": metrics.mean_itl_ms,
"median_itl_ms": metrics.median_itl_ms,
"std_itl_ms": metrics.std_itl_ms,
"p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
@@ -399,8 +466,10 @@ def main(args: argparse.Namespace):
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
base_url = f"{args.base_url}"
else:
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
base_url = f"http://{args.host}:{args.port}"
tokenizer = get_tokenizer(tokenizer_id,
trust_remote_code=args.trust_remote_code)
@@ -456,6 +525,15 @@ def main(args: argparse.Namespace):
for prompt, prompt_formatted, prompt_len, for prompt, prompt_formatted, prompt_len,
output_len in input_requests] output_len in input_requests]
elif args.dataset_name == "random":
input_requests = sample_random_requests(
input_len=args.random_input_len,
output_len=args.random_output_len,
num_prompts=args.num_prompts,
range_ratio=args.random_range_ratio,
tokenizer=tokenizer,
)
else: else:
raise ValueError(f"Unknown dataset: {args.dataset_name}") raise ValueError(f"Unknown dataset: {args.dataset_name}")
@@ -463,6 +541,7 @@ def main(args: argparse.Namespace):
benchmark(
backend=backend,
api_url=api_url,
base_url=base_url,
model_id=model_id,
tokenizer=tokenizer,
input_requests=input_requests,
@@ -470,6 +549,7 @@ def main(args: argparse.Namespace):
use_beam_search=args.use_beam_search,
request_rate=args.request_rate,
disable_tqdm=args.disable_tqdm,
profile=args.profile,
))
# Save config and results to json # Save config and results to json
@@ -549,7 +629,7 @@ if __name__ == "__main__":
"--dataset-name", "--dataset-name",
type=str, type=str,
default="sharegpt", default="sharegpt",
choices=["sharegpt", "sonnet"], choices=["sharegpt", "sonnet", "random"],
help="Name of the dataset to benchmark on.", help="Name of the dataset to benchmark on.",
) )
parser.add_argument("--dataset-path", parser.add_argument("--dataset-path",
@@ -566,7 +646,7 @@ if __name__ == "__main__":
"--tokenizer", "--tokenizer",
type=str, type=str,
help= help=
"Name or path of the tokenizer, if not using the default tokenizer.", "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
) )
parser.add_argument( parser.add_argument(
"--best-of", "--best-of",
@@ -609,6 +689,27 @@ if __name__ == "__main__":
help= help=
"Number of prefix tokens per request, used only for sonnet dataset.", "Number of prefix tokens per request, used only for sonnet dataset.",
) )
parser.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
parser.add_argument( parser.add_argument(
"--request-rate", "--request-rate",
type=float, type=float,
@@ -629,6 +730,12 @@ if __name__ == "__main__":
action="store_true", action="store_true",
help="Specify to disable tqdm progress bar.", help="Specify to disable tqdm progress bar.",
) )
parser.add_argument(
"--profile",
action="store_true",
help="Use Torch Profiler. The endpoint must be launched with "
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
)
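With --profile, the benchmark brackets the measured traffic with one request to the server's /start_profile endpoint and one to /stop_profile; the server must have been launched with VLLM_TORCH_PROFILER_DIR set so it has somewhere to write the trace. A hedged sketch of that flow (synchronous requests used here only for brevity; the script itself goes through its async request functions):

```python
# Hypothetical sketch of the --profile control flow, not the script's own code.
# Assumes the server was launched along the lines of:
#   VLLM_TORCH_PROFILER_DIR=/tmp/vllm_traces vllm serve <your_model>
import requests

base_url = "http://localhost:8000"
requests.post(f"{base_url}/start_profile")   # begin collecting a torch trace
# ... drive the benchmark traffic against {base_url}/v1/completions ...
requests.post(f"{base_url}/stop_profile")    # write the trace to VLLM_TORCH_PROFILER_DIR
```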
parser.add_argument( parser.add_argument(
"--save-result", "--save-result",
action="store_true", action="store_true",

View File

@@ -13,26 +13,25 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1] DEFAULT_TP_SIZES = [1]
# helpers # helpers
def to_fp8(tensor: torch.tensor) -> torch.tensor: def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.tensor) -> torch.tensor: def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int, def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.tensor, torch.tensor]: k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5 a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5 b = torch.randn((n, k), device='cuda').t() * 5
@@ -44,59 +43,18 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
raise ValueError("unsupported dtype") raise ValueError("unsupported dtype")
# impl
def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
return torch.mm(a, b)
def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=out_dtype)
def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
scale_a: torch.tensor, scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=out_dtype,
use_fast_accum=True)
def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
# bench # bench
def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
scale_b: torch.tensor, out_dtype: torch.dtype, label: str, **kwargs) -> TMeasurement:
sub_label: str, fn: Callable, description: str) -> TMeasurement:
min_run_time = 1 min_run_time = 1
globals = { globals = {
"a": a, "args": args,
"b": b, "kwargs": kwargs,
"scale_a": scale_a,
"scale_b": scale_b,
"out_dtype": out_dtype,
"fn": fn, "fn": fn,
} }
return TBenchmark.Timer( return TBenchmark.Timer(
stmt="fn(a, b, scale_a, scale_b, out_dtype)", stmt="fn(*args, **kwargs)",
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@@ -110,19 +68,58 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
a, b = make_rand_tensors(torch.int8, m, n, k) a, b = make_rand_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
timers = [] timers = []
# pytorch impl # pytorch impl - bfloat16
timers.append( timers.append(
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, torch.mm, a.to(dtype=torch.bfloat16),
torch.bfloat16, label, sub_label, pytorch_mm_impl, b.to(dtype=torch.bfloat16)))
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
# pytorch impl - float16
timers.append(
bench_fn(label, sub_label,
"pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
# cutlass impl # cutlass impl
timers.append( timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm")) ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
# cutlass with bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
bias))
# cutlass with azp per-tensor
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj))
# cutlass with azp per-tensor + bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, None, bias))
# cutlass with azp per-token
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, azp))
# cutlass with azp per-token + bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, azp, bias))
return timers
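The refactored bench_fn above no longer hard-codes the (a, b, scale_a, scale_b, out_dtype) signature: it receives the callable plus arbitrary positional and keyword arguments and always times fn(*args, **kwargs). A self-contained sketch of that pattern (a standalone illustration under that assumption, not code from the diff):

```python
# Generic "time fn(*args, **kwargs)" pattern, as in the refactored bench_fn.
import torch
import torch.utils.benchmark as TBenchmark

def bench_sketch(label, sub_label, description, fn, *args, **kwargs):
    timer = TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals={"fn": fn, "args": args, "kwargs": kwargs},
        label=label,
        sub_label=sub_label,
        description=description,
    )
    return timer.blocked_autorange(min_run_time=1)

if __name__ == "__main__":
    a = torch.randn(256, 256)
    b = torch.randn(256, 256)
    print(bench_sketch("demo", "256x256", "torch_mm", torch.mm, a, b))
```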
@@ -133,46 +130,88 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

     timers = []

     # pytorch impl w. bf16
     timers.append(
-        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
-                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
-                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))
+        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda")))

     # pytorch impl: bf16 output, without fp8 fast accum
     timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_bf16_scaled_mm",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.bfloat16))

     # pytorch impl: bf16 output, with fp8 fast accum
     timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 pytorch_fp8_impl_fast_accum,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.bfloat16,
+                 use_fast_accum=True))

     # pytorch impl: fp16 output, without fp8 fast accum
     timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
-                 pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_fp16_scaled_mm",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.float16))

     # pytorch impl: fp16 output, with fp8 fast accum
     timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
-                 pytorch_fp8_impl_fast_accum,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
+        bench_fn(label,
+                 sub_label,
+                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
+                 torch._scaled_mm,
+                 a,
+                 b,
+                 scale_a=scale_a,
+                 scale_b=scale_b,
+                 out_dtype=torch.float16,
+                 use_fast_accum=True))

     # cutlass impl: bf16 output
     timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+                 torch.bfloat16))

     # cutlass impl: fp16 output
     timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
-                 cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))

+    # cutlass impl: bf16 output, with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+                 bias))

+    # cutlass impl: fp16 output, with bias
+    timers.append(
+        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
+                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
+                 bias.to(dtype=torch.float16)))

     return timers
@@ -193,7 +232,6 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(dtype: torch.dtype,
         MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
         timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
@@ -209,7 +247,6 @@ def make_output(data: Iterable[TMeasurement],
                 MKNs: Iterable[Tuple[int, int, int]],
                 base_description: str,
                 timestamp=None):
     print(f"== All Results {base_description} ====")
     print_timers(data)
@@ -244,7 +281,6 @@ def run_range_bench(args):
 def run_model_bench(args):
     print("Benchmarking models:")
     for i, model in enumerate(args.models):
         print(f"[{i}] {model}")


@@ -0,0 +1,89 @@
import random
import time
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
@torch.inference_mode()
def main(num_tokens: int,
hidden_size: int,
add_residual: bool,
dtype: torch.dtype,
seed: int = 0,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device("cuda")
layer = RMSNorm(hidden_size).to(dtype=dtype)
layer.weight.data.normal_(mean=1.0, std=0.1)
scale = 1 / (2 * hidden_size)
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
x *= scale
residual = torch.randn_like(x) * scale if add_residual else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
layer(x, residual)
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:
torch.cuda.cudart().cudaProfilerStop()
return (end_time - start_time) / num_iters
# Warmup.
print("Warming up...")
run_benchmark = run_cuda_benchmark
run_benchmark(num_iters=num_warmup_iters, profile=False)
# Benchmark.
if do_profile:
latency = run_benchmark(num_iters=1, profile=True)
else:
latency = run_benchmark(num_iters=num_iters, profile=False)
print(f"Kernel running time: {latency * 1000000:.3f} us")
if __name__ == '__main__':
parser = FlexibleArgumentParser(
description="Benchmark the layernorm kernel.")
parser.add_argument("--num-tokens", type=int, default=4096)
parser.add_argument("--hidden-size", type=int, default=8192)
parser.add_argument("--add-residual", action="store_true")
parser.add_argument("--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="half")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--profile", action="store_true")
parser.add_argument("--num-warmup-iters", type=int, default=5)
parser.add_argument("--num-iters",
type=int,
default=100,
help="Number of benchmark iterations. "
"If --profile is set, this number is ignored")
args = parser.parse_args()
print(args)
main(num_tokens=args.num_tokens,
hidden_size=args.hidden_size,
add_residual=args.add_residual,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
seed=args.seed,
do_profile=args.profile,
num_warmup_iters=args.num_warmup_iters,
num_iters=args.num_iters)
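A typical invocation of this new RMSNorm microbenchmark looks like the following; the file path is not shown in this compare view, so benchmarks/kernels/benchmark_layernorm.py is assumed here:

    python3 benchmarks/kernels/benchmark_layernorm.py --num-tokens 4096 --hidden-size 8192 --add-residual --dtype bfloat16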


@@ -0,0 +1,372 @@
import argparse
import copy
import itertools
import math
import pickle as pkl
import time
from typing import Callable, Iterable, List, Tuple
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, pack_rows, quantize_weights)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
DEFAULT_TP_SIZES = [1]
def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
w_q = w_q.t().contiguous().t() # make col major
return ops.machete_prepack_B(w_q, wtype)
def make_bench_tensors(
atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
k: int
) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
torch.tensor]]]:
assert wtype.is_integer(), "TODO: support floating point weights"
# we want to make sure that weights don't fit into L2 cache between runs so
# we construct enough weights to exceed L2 cache, which is 50mb on a H100
# so we target total weight size > 2*50mb
num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
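# (illustrative arithmetic, not part of the original file) for k = n = 4096
# and 4-bit weights: ceil(2 * 50 * 1024**2 * 8 / (4096 * 4096 * 4)) == 13,
# i.e. 13 distinct weight copies are cycled through per benchmarked GEMM.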
a = torch.randn((m, k), device="cuda", dtype=atype) * 5
weights = [
torch.randn((k, n), device="cuda", dtype=atype)
for _ in range(num_weights)
]
quantized_weights = [
quantize_weights(w, wtype, group_size) for w in weights
]
return a, quantized_weights
# impl
# bench
def bench_fn(label: str, sub_label: str, description: str,
fn: Callable) -> TMeasurement:
min_run_time = 1
return TBenchmark.Timer(
stmt="fn()",
globals={
"fn": fn
},
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
def loop_over_weights(
a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
torch.tensor, torch.tensor]],
fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
None]):
for w_ref, w_q, w_s, _ in weights:
fn(a, w_ref, w_q, w_s)
def bench(atype: torch.dtype,
wtype: ScalarType,
group_size: int,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
benchmark_marlinv1: bool = True,
sweep_schedules: bool = True) -> Iterable[TMeasurement]:
a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
sub_label += f", L={len(weights)}"
weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
for w_ref, w_q, w_s, w_zp in weights]
timers = []
# pytorch impl
timers.append(
bench_fn(
label, sub_label, "torch.matmul", lambda: loop_over_weights(
a,
weights,
lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
)))
if benchmark_marlinv1:
w_ref = weights[0][0]
w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
wtype.size_bits)
def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
return marlin_permute_scales(w_s, *w_ref.shape, group_size)
weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
marlinv1_permute_scales(w_s), w_zp)
for w_ref, w_q, w_s, w_zp in weights]
workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL)
# marlinv1
timers.append(
bench_fn(
label, sub_label, "marlin_orig", lambda: loop_over_weights(
a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
gptq_marlin_gemm(a,
w_q,
w_s,
w_zp_empty,
g_idx,
sort_indices,
workspace.scratch,
wtype,
size_m=a.shape[0],
size_n=w_ref.shape[1],
size_k=w_ref.shape[0],
is_k_full=True))))
# machete
timers.append(
bench_fn(
label, sub_label, "machete_heuristic", lambda: loop_over_weights(
a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
if sweep_schedules:
print("Finding best schedule for machete")
best = None
best_schedule = None
schedules = ops.machete_supported_schedules(wtype)
for schedule in reversed(schedules):
def run(a, _, w_q, w_s, schedule=schedule):
ops.machete_gemm(a,
w_q,
wtype,
w_s,
b_group_size=group_size,
schedule=schedule)
res = bench_fn(label, sub_label, "machete_best",
lambda: loop_over_weights(a, weights_machete, run))
print(f" {res.median:5.5} ", schedule)
if not best or res.median < best.median:
best = res
best_schedule = schedule
print("Best schedule:", best_schedule)
timers.append(best)
return timers
# runner
def print_timers(timers: Iterable[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(dtype: torch.dtype, sweep_schedules: bool,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype,
scalar_types.uint4b8,
128,
m,
k,
n,
f"{dtype}-gemm",
f"MKN=({m}x{k}x{n})",
sweep_schedules=sweep_schedules)
print_timers(timers)
results.extend(timers)
return results
# output makers
def make_output(
data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None,
):
print(f"== All Results {base_description} ====")
print_timers(data)
# pickle all the results
timestamp = int(time.time()) if timestamp is None else timestamp
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
pkl.dump(data, f)
# argparse runners
def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, args.sweep_schedules, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
n = len(dim_sizes)
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
MKNs = list(zip(Ms, Ks, Ns))
data = run(args.dtype, args.sweep_schedules, MKNs)
make_output(data, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KNs.append(KN)
return KNs
model_bench_data = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
Ms = args.batch_sizes
KNs = model_shapes(model, tp_size)
MKNs = []
for m in Ms:
for k, n in KNs:
MKNs.append((m, k, n))
data = run(args.dtype, args.sweep_schedules, MKNs)
model_bench_data.append(data)
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
print_timers(data)
timestamp = int(time.time())
all_data = []
for d in model_bench_data:
all_data.extend(d)
# pickle all data
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
pkl.dump(all_data, f)
if __name__ == "__main__":
def to_torch_dtype(dt):
if dt == "bfloat16":
return torch.bfloat16
if dt == "float16":
return torch.float16
raise ValueError("unsupported dtype")
parser = FlexibleArgumentParser(
description="""
Benchmark Machete GEMM.
To run square GEMMs:
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
To run dimensions from a model:
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
"--dtype",
type=to_torch_dtype,
required=True,
help="Available options are ['bfloat16', 'float16']",
)
parser.add_argument(
"--sweep-schedules",
action="store_true",
help="Run a sweep over all supported schedules",
)
subparsers = parser.add_subparsers(dest="cmd", required=True)
square_parser = subparsers.add_parser("square_bench")
square_parser.add_argument("--dim-start", type=int, required=True)
square_parser.add_argument("--dim-end", type=int, required=True)
square_parser.add_argument("--dim-increment", type=int, required=True)
square_parser.set_defaults(func=run_square_bench)
range_parser = subparsers.add_parser("range_bench")
range_parser.add_argument("--dim-start", type=int, required=True)
range_parser.add_argument("--dim-end", type=int, required=True)
range_parser.add_argument("--dim-increment", type=int, required=True)
range_parser.add_argument("--m-constant", type=int, default=None)
range_parser.add_argument("--n-constant", type=int, default=None)
range_parser.add_argument("--k-constant", type=int, default=None)
range_parser.set_defaults(func=run_range_bench)
model_parser = subparsers.add_parser("model_bench")
model_parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES.keys(),
)
model_parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
model_parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
args.func(args)


@@ -5,16 +5,19 @@ import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
     GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
-    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
+    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    MarlinWorkspace, marlin_24_quantize, marlin_quantize)
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+    MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    MarlinWorkspace, marlin_quantize)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
+    marlin_24_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    gptq_pack, quantize_weights, sort_weights)
+    gptq_pack, gptq_quantize_weights, sort_weights)
+from vllm.scalar_type import ScalarType
 from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@@ -25,13 +28,14 @@ K_FULL_OPTS = [False, True]
 def bench_run(results: List[benchmark.Measurement], model: str,
-              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
-              size_m: int, size_k: int, size_n: int):
+              act_order: bool, is_k_full: bool, quant_type: ScalarType,
+              group_size: int, size_m: int, size_k: int, size_n: int):
     label = "Quant Matmul"

-    sub_label = ("{}, act={} k_full={}, b={}, g={}, "
-                 "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
-                                         group_size, size_m, size_k, size_n))
+    sub_label = ("{}, act={} k_full={}, q={}, g={}, "
+                 "MKN=({}x{}x{})".format(model, act_order, is_k_full,
+                                         str(quant_type), group_size, size_m,
+                                         size_k, size_n))

     print(f"Testing: {sub_label}")
@@ -48,16 +52,18 @@ def bench_run(results: List[benchmark.Measurement], model: str,
         marlin_g_idx,
         marlin_sort_indices,
         marlin_rand_perm,
-    ) = marlin_quantize(b, num_bits, group_size, act_order)
+    ) = marlin_quantize(b, quant_type, group_size, act_order)

     # Marlin_24 quant
     (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
-     marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
+     marlin_24_s) = marlin_24_quantize(b, quant_type, group_size)

+    marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)

     # GPTQ quant
     (w_ref, q_w, s, g_idx,
-     rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
-    q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
+     rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order)
+    q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)

     # For act_order, sort the "weights" and "g_idx"
     # so that group ids are increasing
@@ -71,10 +77,11 @@ def bench_run(results: List[benchmark.Measurement], model: str,
     marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
                                           GPTQ_MARLIN_24_MAX_PARALLEL)
+    marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)

     globals = {
         # Gen params
-        "num_bits": num_bits,
+        "quant_type": quant_type,
         "group_size": group_size,
         "size_m": size_m,
         "size_n": size_n,
@@ -85,6 +92,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
         "marlin_w_ref": marlin_w_ref,
         "marlin_q_w": marlin_q_w,
         "marlin_s": marlin_s,
+        "marlin_zp": marlin_zp,
         "marlin_g_idx": marlin_g_idx,
         "marlin_sort_indices": marlin_sort_indices,
         "marlin_rand_perm": marlin_rand_perm,
@@ -123,19 +131,29 @@ def bench_run(results: List[benchmark.Measurement], model: str,
     results.append(
         benchmark.Timer(
             stmt=
-            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)",  # noqa: E501
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
-            description="gptq_marlin_gemm",
+            description="gptq_marlin_gemm_fp16",
         ).blocked_autorange(min_run_time=min_run_time))

-    if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_gemm_fp32",
+        ).blocked_autorange(min_run_time=min_run_time))

+    if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
             and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
         results.append(
             benchmark.Timer(
                 stmt=
-                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)",  # noqa: E501
+                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)",  # noqa: E501
                 globals=globals,
                 label=label,
                 sub_label=sub_label,
@@ -145,7 +163,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
     results.append(
         benchmark.Timer(
             stmt=
-            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)",  # noqa: E501
+            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -181,12 +199,13 @@ def main(args):
                 ) > 0 and is_k_full not in args.limit_k_full:
                     continue

-                for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
-                    if len(args.limit_num_bits
-                           ) > 0 and num_bits not in args.limit_num_bits:
+                for quant_type in query_marlin_supported_quant_types(
+                        False):
+                    if len(args.limit_num_bits) > 0 and \
+                            quant_type.size_bits not in args.limit_num_bits:
                         continue

-                    for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+                    for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
                         if len(
                                 args.limit_group_size
                         ) > 0 and group_size not in args.limit_group_size:
@@ -200,8 +219,8 @@ def main(args):
                             for size_m in args.batch_sizes:
                                 bench_run(results, model, act_order, is_k_full,
-                                          num_bits, group_size, size_m, size_k,
-                                          size_n)
+                                          quant_type, group_size, size_m,
+                                          size_k, size_n)

     compare = benchmark.Compare(results)
     compare.print()


@@ -30,19 +30,36 @@ def benchmark_config(
     hidden_size: int,
     topk: int,
     dtype: torch.dtype,
-    use_fp8: bool,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
     num_iters: int = 100,
 ) -> float:
-    init_dtype = torch.float16 if use_fp8 else dtype
+    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    w1 = torch.randn(num_experts,
-                     shard_intermediate_size,
-                     hidden_size,
-                     dtype=init_dtype)
-    w2 = torch.randn(num_experts,
-                     hidden_size,
-                     shard_intermediate_size // 2,
-                     dtype=init_dtype)
+    if use_int8_w8a16:
+        w1 = torch.randint(-127,
+                           127, (
+                               num_experts,
+                               shard_intermediate_size,
+                               hidden_size,
+                           ),
+                           dtype=torch.int8)
+        w2 = torch.randint(-127,
+                           127, (
+                               num_experts,
+                               hidden_size,
+                               shard_intermediate_size // 2,
+                           ),
+                           dtype=torch.int8)
+    else:
+        w1 = torch.randn(num_experts,
+                         shard_intermediate_size,
+                         hidden_size,
+                         dtype=init_dtype)
+        w2 = torch.randn(num_experts,
+                         hidden_size,
+                         shard_intermediate_size // 2,
+                         dtype=init_dtype)
     gating_output = torch.randn(num_iters,
                                 num_tokens,
                                 num_experts,
@@ -52,7 +69,11 @@ def benchmark_config(
     w2_scale = None
     a1_scale = None
     a2_scale = None
-    if use_fp8:
+    if use_int8_w8a16:
+        w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size),
+                               dtype=torch.float32)
+        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
+    if use_fp8_w8a8:
         w1_scale = torch.randn(num_experts, dtype=torch.float32)
         w2_scale = torch.randn(num_experts, dtype=torch.float32)
         a1_scale = torch.randn(1, dtype=torch.float32)
@@ -76,7 +97,8 @@ def benchmark_config(
             renormalize=True,
             inplace=True,
             override_config=config,
-            use_fp8=use_fp8,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
             w1_scale=w1_scale,
             w2_scale=w2_scale,
             a1_scale=a1_scale,
@@ -155,11 +177,13 @@ class BenchmarkWorker:
         hidden_size: int,
         topk: int,
         dtype: torch.dtype,
-        use_fp8: bool,
+        use_fp8_w8a8: bool,
+        use_int8_w8a16: bool,
     ) -> Tuple[Dict[str, int], float]:
         torch.cuda.manual_seed_all(self.seed)
-        dtype_str = "float8" if use_fp8 else None
+        dtype_str = get_config_dtype_str(dtype,
+                                         use_int8_w8a16=use_int8_w8a16,
+                                         use_fp8_w8a8=use_fp8_w8a8)
         # NOTE(woosuk): The current naming convention uses w2.shape[2], which
         # is the intermediate size after silu_and_mul.
         op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
@@ -173,7 +197,8 @@ class BenchmarkWorker:
                                   key=lambda x: abs(x - num_tokens))]
         kernel_time = benchmark_config(config, num_tokens, num_experts,
                                        shard_intermediate_size, hidden_size,
-                                       topk, dtype, use_fp8)
+                                       topk, dtype, use_fp8_w8a8,
+                                       use_int8_w8a16)
         return config, kernel_time

     def tune(
@@ -184,9 +209,10 @@ class BenchmarkWorker:
         hidden_size: int,
         topk: int,
         dtype: torch.dtype,
-        use_fp8: bool,
-        search_space: List[BenchmarkConfig],
-    ) -> BenchmarkConfig:
+        use_fp8_w8a8: bool,
+        use_int8_w8a16: bool,
+        search_space: List[Dict[str, int]],
+    ) -> Dict[str, int]:
         best_config = None
         best_time = float("inf")
         for config in tqdm(search_space):
@@ -198,7 +224,8 @@ class BenchmarkWorker:
                                                 hidden_size,
                                                 topk,
                                                 dtype,
-                                                use_fp8,
+                                                use_fp8_w8a8,
+                                                use_int8_w8a16,
                                                 num_iters=10)
             except triton.runtime.autotuner.OutOfResources:
                 # Some configurations may be invalid and fail to compile.
@@ -224,20 +251,19 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     }

-def save_configs(
-    configs: Dict[int, BenchmarkConfig],
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8: bool,
-) -> None:
-    dtype_str = "float8" if use_fp8 else None
+def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
+                 shard_intermediate_size: int, hidden_size: int, topk: int,
+                 dtype: torch.dtype, use_fp8_w8a8: bool,
+                 use_int8_w8a16: bool) -> None:
+    dtype_str = get_config_dtype_str(dtype,
+                                     use_int8_w8a16=use_int8_w8a16,
+                                     use_fp8_w8a8=use_fp8_w8a8)

     # NOTE(woosuk): The current naming convention uses w2.shape[2], which
     # is the intermediate size after silu_and_mul.
     filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
                                     dtype_str)

     print(f"Writing best config to {filename}...")
     with open(filename, "w") as f:
         json.dump(configs, f, indent=4)
@@ -253,6 +279,11 @@ def main(args: argparse.Namespace):
         topk = config.ffn_config.moe_top_k
         intermediate_size = config.ffn_config.ffn_hidden_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "JambaForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
         # Default: Mixtral.
         E = config.num_local_experts
@@ -262,7 +293,8 @@ def main(args: argparse.Namespace):
     hidden_size = config.hidden_size
     dtype = config.torch_dtype
-    use_fp8 = args.dtype == "fp8"
+    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
+    use_int8_w8a16 = args.dtype == "int8_w8a16"

     if args.batch_size is None:
         batch_sizes = [
@@ -294,21 +326,21 @@ def main(args: argparse.Namespace):
         start = time.time()
         configs = _distribute(
             "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                      topk, dtype, use_fp8, search_space)
+                      topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space)
                      for batch_size in batch_sizes])
         best_configs = {
             M: sort_config(config)
             for M, config in zip(batch_sizes, configs)
         }
         save_configs(best_configs, E, shard_intermediate_size, hidden_size,
-                     topk, dtype, use_fp8)
+                     topk, dtype, use_fp8_w8a8, use_int8_w8a16)
         end = time.time()
         print(f"Tuning took {end - start:.2f} seconds")
     else:
-        outputs = _distribute("benchmark",
-                              [(batch_size, E, shard_intermediate_size,
-                                hidden_size, topk, dtype, use_fp8)
-                               for batch_size in batch_sizes])
+        outputs = _distribute(
+            "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
+                           topk, dtype, use_fp8_w8a8, use_int8_w8a16)
+                          for batch_size in batch_sizes])

         for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
             print(f"Batch size: {batch_size}, config: {config}")
@@ -323,7 +355,7 @@ if __name__ == "__main__":
     parser.add_argument("--tp-size", "-tp", type=int, default=2)
     parser.add_argument("--dtype",
                         type=str,
-                        choices=["auto", "fp8"],
+                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
                         default="auto")
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)


@@ -100,7 +100,7 @@ def main(
     start_time = time.perf_counter()

     # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0

     for _ in range(num_iters):
         if version == "v1":
@@ -117,7 +117,8 @@ def main(
                 max_seq_len,
                 alibi_slopes,
                 kv_cache_dtype,
-                kv_scale,
+                k_scale,
+                v_scale,
             )
         elif version == "v2":
             ops.paged_attention_v2(
@@ -136,7 +137,8 @@ def main(
                 max_seq_len,
                 alibi_slopes,
                 kv_cache_dtype,
-                kv_scale,
+                k_scale,
+                v_scale,
             )
         else:
             raise ValueError(f"Invalid version: {version}")
@@ -173,7 +175,7 @@ if __name__ == '__main__':
     parser.add_argument("--num-kv-heads", type=int, default=8)
     parser.add_argument("--head-size",
                         type=int,
-                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        choices=[64, 80, 96, 112, 120, 128, 192, 256],
                         default=128)
     parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
     parser.add_argument("--use-alibi", action="store_true")


@@ -0,0 +1,103 @@
import random
import time
import torch
from vllm import _custom_ops as ops
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
@torch.inference_mode()
def main(num_tokens: int,
hidden_size: int,
static_scale: bool,
quant_dtype: torch.dtype,
dtype: torch.dtype,
seed: int = 0,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device("cuda")
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
if quant_dtype == torch.int8:
ops.scaled_int8_quant(x, scale)
else:
ops.scaled_fp8_quant(x, scale)
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:
torch.cuda.cudart().cudaProfilerStop()
return (end_time - start_time) / num_iters
# Warmup.
print("Warming up...")
run_benchmark = run_cuda_benchmark
run_benchmark(num_iters=num_warmup_iters, profile=False)
# Benchmark.
if do_profile:
latency = run_benchmark(num_iters=1, profile=True)
else:
latency = run_benchmark(num_iters=num_iters, profile=False)
print(f"Kernel running time: {latency * 1000000:.3f} us")
if __name__ == '__main__':
def to_torch_dtype(dt):
if dt == "int8":
return torch.int8
if dt == "fp8":
return torch.float8_e4m3fn
raise ValueError(f"Unsupported dtype: {dt}")
parser = FlexibleArgumentParser(
description="Benchmark the quantization (fp8 or int8) kernel.")
parser.add_argument("--num-tokens", type=int, default=4096)
parser.add_argument("--hidden-size", type=int, default=8192)
parser.add_argument("--static-scale", action="store_true")
parser.add_argument("--quant-dtype",
type=str,
choices=["fp8", "int8"],
default="int8")
parser.add_argument("--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="half")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--profile", action="store_true")
parser.add_argument("--num-warmup-iters", type=int, default=5)
parser.add_argument("--num-iters",
type=int,
default=100,
help="Number of benchmark iterations. "
"If --profile is set, this number is ignored")
args = parser.parse_args()
print(args)
main(num_tokens=args.num_tokens,
hidden_size=args.hidden_size,
static_scale=args.static_scale,
quant_dtype=to_torch_dtype(args.quant_dtype),
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
seed=args.seed,
do_profile=args.profile,
num_warmup_iters=args.num_warmup_iters,
num_iters=args.num_iters)
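As with the other kernel microbenchmarks, this script is run directly; assuming it lands at benchmarks/kernels/benchmark_quant.py (the path is not shown in this view), a sample run would be:

    python3 benchmarks/kernels/benchmark_quant.py --num-tokens 4096 --hidden-size 8192 --quant-dtype fp8 --dtype half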


@@ -94,7 +94,7 @@ if __name__ == '__main__':
     parser.add_argument("--num-heads", type=int, default=8)
     parser.add_argument("--head-size",
                         type=int,
-                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        choices=[64, 80, 96, 112, 120, 128, 192, 256],
                         default=128)
     parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
     parser.add_argument("--dtype",


@@ -0,0 +1,64 @@
import math
import pickle
import re
from collections import defaultdict
from typing import List
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.utils import FlexibleArgumentParser
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('filename', type=str)
args = parser.parse_args()
with open(args.filename, 'rb') as f:
data: List[TMeasurement] = pickle.load(f)
results = defaultdict(lambda: list())
for v in data:
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
if result is not None:
KN = result.group(1)
else:
raise Exception("MKN not found")
result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label)
if result is not None:
M = result.group(1)
else:
raise Exception("MKN not found")
kernel = v.task_spec.description
results[KN].append({
"kernel": kernel,
"batch_size": M,
"median": v.median
})
rows = int(math.ceil(len(results) / 2))
fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
axs = axs.flatten()
axs_idx = 0
for shape, data in results.items():
plt.sca(axs[axs_idx])
df = pd.DataFrame(data)
sns.lineplot(data=df,
x="batch_size",
y="median",
hue="kernel",
style="kernel",
markers=True,
dashes=False,
palette="Dark2")
plt.title(f"Shape: {shape}")
plt.ylabel("time (median, s)")
axs_idx += 1
plt.tight_layout()
plt.savefig("graph_machete_bench.pdf")


@@ -0,0 +1,43 @@
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
# TP1 shapes
WEIGHT_SHAPES = {
"mistralai/Mistral-7B-v0.1": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-7b-hf": [
([4096, 12288], 1),
([4096, 4096], 0),
([4096, 22016], 1),
([11008, 4096], 0),
],
"meta-llama/Llama-3-8b": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-13b-hf": [
([5120, 15360], 1),
([5120, 5120], 0),
([5120, 27648], 1),
([13824, 5120], 0),
],
"meta-llama/Llama-2-70b-hf": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
}
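For reference, a minimal sketch of how these ([K, N], TP_SPLIT_DIM) entries are typically consumed (assumption: the consumer simply divides the TP-split dimension by the tensor-parallel size, as the benchmark scripts above do):

    def shard_shape(kn, tp_split_dim, tp_size):
        # Divide only the dimension marked as TP-split.
        kn = list(kn)
        kn[tp_split_dim] //= tp_size
        return kn

    # ([14336, 4096], 0) at TP2 -> [7168, 4096], matching the comment above.
    assert shard_shape([14336, 4096], 0, 2) == [7168, 4096]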


@@ -83,6 +83,8 @@ endif()
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

+list(APPEND LIBS "numa")

 #
 # Define extension targets
@@ -95,6 +97,7 @@ set(VLLM_EXT_SRC
     "csrc/cpu/activation.cpp"
     "csrc/cpu/attention.cpp"
     "csrc/cpu/cache.cpp"
+    "csrc/cpu/utils.cpp"
     "csrc/cpu/layernorm.cpp"
     "csrc/cpu/pos_encoding.cpp"
     "csrc/cpu/torch_bindings.cpp")
@@ -104,11 +107,11 @@ define_gpu_extension_target(
   DESTINATION vllm
   LANGUAGE CXX
   SOURCES ${VLLM_EXT_SRC}
+  LIBRARIES ${LIBS}
   COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
   USE_SABI 3
   WITH_SOABI
 )

-add_custom_target(default)
 message(STATUS "Enabling C extension.")
 add_dependencies(default _C)


@@ -181,7 +181,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
     #
     # The torch cmake setup hardcodes the detected architecture flags in
     # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
-    # can't modified on a per-target basis, e.g. for the `punica` extension.
+    # can't modified on a per-target basis.
     # So, all the `-gencode` flags need to be extracted and removed from
     # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
     # Since it's not possible to use `target_compiler_options` for adding target


@@ -65,6 +65,9 @@ DEFAULT_CONDA_PATTERNS = {
     "optree",
     "nccl",
     "transformers",
+    "zmq",
+    "nvidia",
+    "pynvml",
 }

 DEFAULT_PIP_PATTERNS = {
@@ -77,6 +80,9 @@ DEFAULT_PIP_PATTERNS = {
     "onnx",
     "nccl",
     "transformers",
+    "zmq",
+    "nvidia",
+    "pynvml",
 }
@@ -263,8 +269,9 @@ def get_neuron_sdk_version(run_lambda):
 def get_vllm_version():
     try:
         import vllm
-        return vllm.__version__
-    except ImportError:
+        return vllm.__version__ + "@" + vllm.__commit__
+    except Exception:
+        # old version of vllm does not have __commit__
         return 'N/A'


@@ -105,9 +105,9 @@ __device__ void paged_attention_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
-    const int blocksparse_vert_stride, const int blocksparse_block_size,
-    const int blocksparse_head_sliding_step) {
+    const float k_scale, const float v_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   const int seq_idx = blockIdx.y;
   const int partition_idx = blockIdx.z;
   const int max_num_partitions = gridDim.z;
@@ -285,7 +285,7 @@ __device__ void paged_attention_kernel(
           Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(
               k_ptr + offset1 * BLOCK_SIZE * x + offset2);
           k_vecs[j] = fp8::scaled_convert<K_vec, Quant_vec, KV_DTYPE>(
-              k_vec_quant, kv_scale);
+              k_vec_quant, k_scale);
         }
       }
@@ -415,7 +415,7 @@ __device__ void paged_attention_kernel(
           *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
       // Vector conversion from V_quant_vec to V_vec.
       v_vec = fp8::scaled_convert<V_vec, V_quant_vec, KV_DTYPE>(v_quant_vec,
-                                                                kv_scale);
+                                                                v_scale);
     }
     if (block_idx == num_seq_blocks - 1) {
       // NOTE(woosuk): When v_vec contains the tokens that are out of the
@@ -513,15 +513,15 @@ __global__ void paged_attention_v1_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
-    const int blocksparse_vert_stride, const int blocksparse_block_size,
-    const int blocksparse_head_sliding_step) {
+    const float k_scale, const float v_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
                          KV_DTYPE, IS_BLOCK_SPARSE>(
       /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
       v_cache, num_kv_heads, scale, block_tables, seq_lens,
       max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
-      kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks,
+      kv_head_stride, k_scale, v_scale, tp_rank, blocksparse_local_blocks,
      blocksparse_vert_stride, blocksparse_block_size,
      blocksparse_head_sliding_step);
 }
@@ -549,14 +549,14 @@ __global__ void paged_attention_v2_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
-    const int blocksparse_vert_stride, const int blocksparse_block_size,
-    const int blocksparse_head_sliding_step) {
+    const float k_scale, const float v_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
                          KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE>(
       exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
       block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
-      kv_block_stride, kv_head_stride, kv_scale, tp_rank,
+      kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank,
      blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
      blocksparse_head_sliding_step);
 }
@@ -682,7 +682,7 @@ __global__ void paged_attention_v2_reduce_kernel(
         out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
         scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
         alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
-        kv_scale, tp_rank, blocksparse_local_blocks, \
+        k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
         blocksparse_vert_stride, blocksparse_block_size, \
         blocksparse_head_sliding_step);
@@ -694,8 +694,8 @@ void paged_attention_v1_launcher(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
-    const int tp_rank, const int blocksparse_local_blocks,
+    const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
+    float v_scale, const int tp_rank, const int blocksparse_local_blocks,
     const int blocksparse_vert_stride, const int blocksparse_block_size,
     const int blocksparse_head_sliding_step) {
   int num_seqs = query.size(0);
@@ -706,7 +706,7 @@ void paged_attention_v1_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);

   // NOTE: alibi_slopes is optional.
@@ -751,6 +751,9 @@ void paged_attention_v1_launcher(
     case 112:
       LAUNCH_PAGED_ATTENTION_V1(112);
       break;
+    case 120:
+      LAUNCH_PAGED_ATTENTION_V1(120);
+      break;
     case 128:
       LAUNCH_PAGED_ATTENTION_V1(128);
       break;
@@ -770,7 +773,7 @@ void paged_attention_v1_launcher(
   paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, \
                               IS_BLOCK_SPARSE>( \
       out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
-      seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank, \
+      seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \
       blocksparse_local_blocks, blocksparse_vert_stride, \
       blocksparse_block_size, blocksparse_head_sliding_step);
@@ -815,8 +818,8 @@ void paged_attention_v1(
     torch::Tensor& seq_lens,  // [num_seqs]
     int64_t block_size, int64_t max_seq_len,
     const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
+    const std::string& kv_cache_dtype, double k_scale, double v_scale,
+    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step) {
   const bool is_block_sparse = (blocksparse_vert_stride > 1);
@@ -833,7 +836,7 @@ void paged_attention_v1(
         exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
         value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
         seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
-        kv_block_stride, kv_head_stride, kv_scale, tp_rank, \
+        kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \
         blocksparse_local_blocks, blocksparse_vert_stride, \
         blocksparse_block_size, blocksparse_head_sliding_step); \
   vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, \
@@ -850,8 +853,8 @@ void paged_attention_v2_launcher(
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
-    const int tp_rank, const int blocksparse_local_blocks,
+    const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
+    float v_scale, const int tp_rank, const int blocksparse_local_blocks,
     const int blocksparse_vert_stride, const int blocksparse_block_size,
     const int blocksparse_head_sliding_step) {
   int num_seqs = query.size(0);
@@ -862,7 +865,7 @@ void paged_attention_v2_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);

   // NOTE: alibi_slopes is optional.
@@ -912,6 +915,9 @@ void paged_attention_v2_launcher(
     case 112:
       LAUNCH_PAGED_ATTENTION_V2(112);
       break;
+    case 120:
+      LAUNCH_PAGED_ATTENTION_V2(120);
+      break;
     case 128:
       LAUNCH_PAGED_ATTENTION_V2(128);
       break;
@@ -932,8 +938,9 @@ void paged_attention_v2_launcher(
                               IS_BLOCK_SPARSE>( \
       out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
       num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
-      kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \
-      blocksparse_block_size, blocksparse_head_sliding_step);
+      k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
+      blocksparse_vert_stride, blocksparse_block_size, \
+      blocksparse_head_sliding_step);

 #define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
   switch (is_block_sparse) { \


@@ -34,7 +34,7 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
   A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
 #pragma unroll
   for (int ii = 1; ii < N; ++ii) {
-    qk_vec = fma(q[ii], k[ii], qk_vec);
+    qk_vec = vllm::fma(q[ii], k[ii], qk_vec);
   }

   // Finalize the reduction across lanes.

Some files were not shown because too many files have changed in this diff.