[Bugfix] Mistral tool calling when content is list (#18729 )

Signed-off-by: mgoin <mgoin64@gmail.com>
[Core] Automatically cast multi-modal input dtype (#18756 )
2025-05-27 09:05:37 -07:00 · 2025-05-27 23:45:48 +08:00 · 2025-05-27 13:08:44 +00:00 · 2025-05-27 20:06:34 +08:00 · 2025-05-27 09:37:06 +00:00 · 2025-05-27 09:19:18 +00:00
2624 changed files with 528504 additions and 40196 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -0,0 +1,52 @@
 # SPDX-License-Identifier: Apache-2.0
 import os
 import sys
 import zipfile
 # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
 VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
 def print_top_10_largest_files(zip_file):
    """Print the ten largest members of ``zip_file`` by uncompressed size.

    Args:
        zip_file: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    One line is printed per member, largest first, with the size expressed
    in units of 1024 * 1024 bytes.
    """
    bytes_per_unit = 1024 * 1024
    with zipfile.ZipFile(zip_file, "r") as archive:
        # Collect (name, uncompressed size) pairs while the archive is open.
        sized_members = sorted(
            (
                (name, archive.getinfo(name).file_size)
                for name in archive.namelist()
            ),
            key=lambda entry: entry[1],
            reverse=True,
        )
    for name, nbytes in sized_members[:10]:
        print(f"{name}: {nbytes / bytes_per_unit:.2f} MBs uncompressed.")
 def check_wheel_size(directory):
    """Walk ``directory`` and verify every ``.whl`` file against the size limit.

    Args:
        directory: Root directory to search recursively.

    Returns:
        1 as soon as a wheel larger than ``VLLM_MAX_SIZE_MB`` is found (after
        printing its ten largest members), otherwise 0.
    """
    bytes_per_unit = 1024 * 1024
    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".whl"):
                continue
            wheel_path = os.path.join(root, file_name)
            wheel_size_mb = os.path.getsize(wheel_path) / bytes_per_unit
            if wheel_size_mb <= VLLM_MAX_SIZE_MB:
                print(
                    f"Wheel {wheel_path} is within the allowed size "
                    f"({wheel_size_mb:.2f} MB)."
                )
                continue
            # Oversized wheel: report it, show what is taking the space,
            # and stop at the first offender.
            print(
                f"Not allowed: Wheel {wheel_path} is larger "
                f"({wheel_size_mb:.2f} MB) than the limit "
                f"({VLLM_MAX_SIZE_MB} MB)."
            )
            print_top_10_largest_files(wheel_path)
            return 1
    return 0
 if __name__ == "__main__":
    # The directory to scan is the first positional argument; without it,
    # print the usage line and exit with a failure status.
    try:
        directory = sys.argv[1]
    except IndexError:
        print("Usage: python check-wheel-size.py <directory>")
        sys.exit(1)
    # The process exit status is whatever check_wheel_size() returns.
    sys.exit(check_wheel_size(directory))
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@@ -1,18 +0,0 @@
 #!/bin/bash
 # Download the LLaVA example tensors and images into ./images.
 # -e: abort on the first failing command; -x: trace each command as it runs.
 set -ex
 # Make a pipeline fail if any command in it fails.
 set -o pipefail
 # Install wget and curl via apt-get only when at least one of them is missing.
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 mkdir -p images
 cd images
 # Fetch each asset individually from the S3 bucket over HTTPS.
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
 # Return to the directory the script was in before entering images/.
 cd -
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -0,0 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 import argparse
 import os
 # Minimal index page containing a single link to the wheel.
 # ``{wheel_html_escaped}`` is the href target, ``{wheel}`` the visible text.
 template = """<!DOCTYPE html>
 <html>
    <body>
    <h1>Links for vLLM</h1>
        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
    </body>
 </html>
 """
 parser = argparse.ArgumentParser()
 parser.add_argument("--wheel", help="The wheel path.", required=True)
 args = parser.parse_args()
 # Only the file name is used in the link; the directory part is dropped.
 filename = os.path.basename(args.wheel)
 with open("index.html", "w", encoding="utf-8") as f:
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
    )
 # Report success only after index.html has actually been written and closed.
 print(f"Generated index.html for {args.wheel}")
--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@@ -0,0 +1,13 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.671
  - name: "exact_match,flexible-extract"
    value: 0.664
 limit: 1000
 num_fewshot: 5
 trust_remote_code: True
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
@@ -0,0 +1,12 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.905
  - name: "exact_match,flexible-extract"
    value: 0.905
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@@ -0,0 +1,12 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.892
  - name: "exact_match,flexible-extract"
    value: 0.892
 limit: 250
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.752
  - name: "exact_match,flexible-extract"
    value: 0.754
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.755
  - name: "exact_match,flexible-extract"
    value: 0.755
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.764
  - name: "exact_match,flexible-extract"
    value: 0.764
 limit: 250
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.728
  - name: "exact_match,flexible-extract"
    value: 0.728
 limit: 250
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.758
  - name: "exact_match,flexible-extract"
    value: 0.759
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@@ -0,0 +1,12 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.756
  - name: "exact_match,flexible-extract"
    value: 0.752
 limit: 250
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
@@ -0,0 +1,11 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.335
  - name: "exact_match,flexible-extract"
    value: 0.323
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.356
  - name: "exact_match,flexible-extract"
    value: 0.358
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.231
  - name: "exact_match,flexible-extract"
    value: 0.22
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.86
  - name: "exact_match,flexible-extract"
    value: 0.86
 limit: 250
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.624
  - name: "exact_match,flexible-extract"
    value: 0.624
 limit: 250
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@@ -0,0 +1,12 @@
 # For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.632
 limit: 250
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
 model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.30
  - name: "exact_match,flexible-extract"
    value: 0.465
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.578
  - name: "exact_match,flexible-extract"
    value: 0.585
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.593
  - name: "exact_match,flexible-extract"
    value: 0.588
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.595
  - name: "exact_match,flexible-extract"
    value: 0.582
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.792
  - name: "exact_match,flexible-extract"
    value: 0.824
 limit: 250
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
@@ -0,0 +1,11 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
 model_name: "Qwen/Qwen2.5-1.5B-Instruct"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.54
  - name: "exact_match,flexible-extract"
    value: 0.59
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -0,0 +1,11 @@
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.47
  - name: "exact_match,flexible-extract"
    value: 0.64
 limit: 1319
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
@@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.6353
  - name: "exact_match,flexible-extract"
    value: 0.637
 limit: null
 num_fewshot: null 
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -0,0 +1,6 @@
 Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
 Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
 Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -0,0 +1,6 @@
 Qwen2.5-1.5B-Instruct.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-compressed-tensors.yaml
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@@ -0,0 +1,43 @@
 # SPDX-License-Identifier: Apache-2.0
 from pathlib import Path
 import pytest
 def pytest_addoption(parser):
    """Register the command-line options used by the lm-eval correctness tests.

    Args:
        parser: Object exposing ``addoption``; each option below is
            registered on it in order.
    """
    option_specs = (
        (
            "--config-list-file",
            {
                "action": "store",
                "help": "Path to the file listing model config YAMLs (one per line)",
            },
        ),
        (
            "--tp-size",
            {
                "action": "store",
                "default": "1",
                "help": "Tensor parallel size to use for evaluation",
            },
        ),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)
@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    """Return the ``--config-list-file`` option joined onto ``config_dir``.

    NOTE(review): no ``config_dir`` fixture is defined in the visible part of
    this conftest -- confirm it is provided elsewhere before relying on this.
    """
    return config_dir / pytestconfig.getoption("--config-list-file")
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Return the value of the ``--tp-size`` command-line option."""
    requested_tp_size = pytestconfig.getoption("--tp-size")
    return requested_tp_size
 def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` with the configs named in the list file.

    The file given by ``--config-list-file`` is read line by line. Blank
    lines and comment lines (first non-blank character ``#``) are skipped;
    every other line is interpreted as a path relative to the directory that
    contains the list file.

    Args:
        metafunc: The pytest ``Metafunc`` for the test being collected.

    Raises:
        ValueError: If a test requests ``config_filename`` but
            ``--config-list-file`` was not supplied.
    """
    if "config_filename" not in metafunc.fixturenames:
        return
    rel_path = metafunc.config.getoption("--config-list-file")
    if rel_path is None:
        # Fail with an actionable message instead of an opaque TypeError
        # from Path(None).
        raise ValueError(
            "--config-list-file is required to parametrize 'config_filename'"
        )
    config_list_file = Path(rel_path).resolve()
    config_dir = config_list_file.parent
    with open(config_list_file, encoding="utf-8") as f:
        # Strip first so that indented comment lines are recognised too.
        entries = (line.strip() for line in f)
        configs = [
            config_dir / entry
            for entry in entries
            if entry and not entry.startswith("#")
        ]
    metafunc.parametrize("config_filename", configs)
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -0,0 +1,46 @@
 #!/bin/bash
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
 #   pip install lm-eval==0.4.4
 # Print the help text describing the supported command-line flags.
 usage() {
    echo
    echo "Runs lm eval harness on GSM8k using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo
 }
 # Map each flag onto the variable consumed by the lm_eval call below;
 # an unrecognised flag prints the usage text and exits with status 1.
 while getopts "m:b:l:f:" OPT; do
  case ${OPT} in
    m ) MODEL="$OPTARG" ;;
    b ) BATCH_SIZE="$OPTARG" ;;
    l ) LIMIT="$OPTARG" ;;
    f ) FEWSHOT="$OPTARG" ;;
    \? )
        usage
        exit 1
        ;;
  esac
 done
 # Run GSM8k through lm_eval's "hf" model backend with the parsed settings.
 lm_eval --model hf \
  --model_args "pretrained=$MODEL,parallelize=True" \
  --tasks gsm8k \
  --num_fewshot "$FEWSHOT" \
  --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -0,0 +1,51 @@
 #!/bin/bash
 # We can use this script to compute baseline accuracy on GSM for vllm.
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
 #   pip install lm-eval==0.4.4
 # Print the help text describing the supported command-line flags.
 usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
 }
 # Map each flag onto the variable consumed by the lm_eval call below;
 # an unrecognised flag prints the usage text and exits with status 1.
 while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m ) MODEL="$OPTARG" ;;
    b ) BATCH_SIZE="$OPTARG" ;;
    l ) LIMIT="$OPTARG" ;;
    f ) FEWSHOT="$OPTARG" ;;
    t ) TP_SIZE="$OPTARG" ;;
    \? )
        usage
        exit 1
        ;;
  esac
 done
 # Run GSM8k through lm_eval's "vllm" model backend with the parsed settings.
 lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k \
  --num_fewshot "$FEWSHOT" \
  --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -0,0 +1,54 @@
 # SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
 pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
 """
 import lm_eval
 import numpy as np
 import yaml
 RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
    """Run ``lm_eval.simple_evaluate`` with the vllm backend for one config.

    Args:
        eval_config: Mapping with ``model_name``, ``tasks``, ``num_fewshot``
            and ``limit`` keys, plus an optional ``trust_remote_code`` key
            (treated as ``False`` when absent).
        tp_size: Value substituted into ``tensor_parallel_size`` in the
            model arguments.

    Returns:
        Whatever ``lm_eval.simple_evaluate`` returns.
    """
    trust_remote_code = eval_config.get("trust_remote_code", False)
    # Comma-separated key=value string handed to lm_eval as model_args.
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        "enforce_eager=true,"
        "add_bos_token=true,"
        f"trust_remote_code={trust_remote_code}"
    )
    task_names = [task["name"] for task in eval_config["tasks"]]
    return lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=task_names,
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
 def test_lm_eval_correctness_param(config_filename, tp_size):
    """Check measured metrics against the expected values in a config file.

    Every configured metric is printed next to its measured value; the test
    fails at the end if any metric is not within ``RTOL`` (relative) of its
    expected value according to ``np.isclose``.
    """
    config_text = config_filename.read_text(encoding="utf-8")
    eval_config = yaml.safe_load(config_text)
    results = launch_lm_eval(eval_config, tp_size)
    all_within_tolerance = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={ground_truth} | measured={measured_value}"
            )
            # Keep looping after a mismatch so every metric is still printed.
            if all_within_tolerance and not np.isclose(
                ground_truth, measured_value, rtol=RTOL
            ):
                all_within_tolerance = False
    assert all_within_tolerance
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -0,0 +1,143 @@
 # vLLM benchmark suite
 ## Introduction
 This directory contains two sets of benchmarks for vllm.
 - Performance benchmark: benchmark vllm's performance under various workloads, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
 See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
 ## Performance benchmark quick overview
 **Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
 **Benchmarking Duration**: about 1hr.
 **For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
 ## Nightly benchmark quick overview
 **Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
 **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
 **Benchmarking Duration**: about 3.5hrs.
 ## Trigger the benchmark
 Performance benchmark will be triggered when:
 - A PR is merged into vllm.
 - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 Nightly benchmark will be triggered when:
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 ## Performance benchmark details
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 ### Latency test
 Here is an example of one test inside `latency-tests.json`:
 ```json
 [
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
 ]
 ```
 In this example:
 - The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
 - The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use underscores `_` instead of dashes `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
 ### Throughput test
 The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are passed to `benchmark_throughput.py`.
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
 ### Serving test
 We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 ```json
 [
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
 ]
 ```
 Inside this example:
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server_parameters` attribute includes the command line arguments for the vLLM server.
 - The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`.
 - The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
 The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) still varies the output greatly.
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 ### Visualizing the results
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait until the benchmark finishes running.
 The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
 ## Nightly test details
 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
 ### Workflow
 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
 - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
 - The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
 - At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
 ### Nightly tests
 In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
 ### Docker containers
 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
 WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -0,0 +1,184 @@
 steps:
  # Gate step: runs wait-for-image.sh inside a small curl+jq container.
  # The benchmark steps reference this step through its `key` in `depends_on`.
  - label: "Wait for container to be ready"
    key: wait-for-container-image
    agents:
      queue: A100
    plugins:
    - kubernetes:
        podSpec:
          containers:
          - image: badouralix/curl-jq
            command:
            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
  # Housekeeping on the H100 queue: removes all unused docker images,
  # containers, networks and volumes without prompting.
  - label: "Cleanup H100"
    agents:
      queue: H100
    # `~` is YAML null: this step declares no dependency on other steps.
    depends_on: ~
    command: docker system prune -a --volumes --force
  # Postmerge A100 benchmark (kubernetes pod requesting 8 GPUs).
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    # Only scheduled for builds of the `main` branch.
    if: build.branch == "main"
    plugins:
    - kubernetes:
        podSpec:
          priorityClassName: perf-benchmark
          containers:
          # Image comes from the postmerge repo, tagged with the commit being built.
          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
            command:
            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: devshm
              mountPath: /dev/shm
            env:
            - name: VLLM_USAGE_SOURCE
              value: ci-test
            # HF_TOKEN is read from the `hf-token-secret` kubernetes secret.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          nodeSelector:
            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
          volumes:
          # Memory-backed emptyDir that the container mounts at /dev/shm.
          - name: devshm
            emptyDir:
              medium: Memory
  # Postmerge H200 benchmark (docker plugin on the agent host).
  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    # Only scheduled for builds of the `main` branch.
    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        # Only host GPUs 4-7 are exposed to the container.
        gpus: 4,5,6,7
        volumes:
          # Host directory mounted as the Hugging Face cache inside the container.
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        # Variables listed by name only; their values come from the agent environment.
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
  #- block: "Run H100 Benchmark"
    #key: block-h100
    #depends_on: ~
  # Postmerge H100 benchmark (docker plugin on the agent host).
  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    # Only scheduled for builds of the `main` branch.
    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
        volumes:
          # Host directory mounted as the Hugging Face cache inside the container.
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        # Variables listed by name only; their values come from the agent environment.
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
  # Premerge benchmark
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    # Scheduled for every branch except `main`.
    if: build.branch != "main"
    plugins:
    - kubernetes:
        podSpec:
          priorityClassName: perf-benchmark
          containers:
          # Image comes from the CI test repo, tagged with the commit being built.
          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
            command:
            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: devshm
              mountPath: /dev/shm
            env:
            - name: VLLM_USAGE_SOURCE
              value: ci-test
            # HF_TOKEN is read from the `hf-token-secret` kubernetes secret.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          nodeSelector:
            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
          volumes:
          # Memory-backed emptyDir that the container mounts at /dev/shm.
          - name: devshm
            emptyDir:
              medium: Memory
  # Premerge H200 benchmark (docker plugin on the agent host).
  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    # Scheduled for every branch except `main`.
    if: build.branch != "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        # Only host GPUs 4-7 are exposed to the container.
        gpus: 4,5,6,7
        volumes:
          # Host directory mounted as the Hugging Face cache inside the container.
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        # Variables listed by name only; their values come from the agent environment.
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
  #- block: "Run H100 Benchmark"
    #key: block-h100
    #depends_on: ~
  # Premerge H100 benchmark (docker plugin on the agent host).
  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    # Scheduled for every branch except `main`.
    if: build.branch != "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
        volumes:
          # Host directory mounted as the Hugging Face cache inside the container.
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        # Variables listed by name only; their values come from the agent environment.
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@@ -0,0 +1,27 @@
 ## Description
 This file contains the downloading link for benchmarking results.
 - [benchmarking pipeline](artifact://nightly-pipeline.yaml)
 - [benchmarking results](artifact://results.zip)
 - [benchmarking code](artifact://nightly-benchmarks.zip)
 Please download the visualization scripts in the post
 ## Results reproduction
 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code:
  ```console
  export HF_TOKEN=<your HF token>
  apt update
  apt install -y git
  unzip nightly-benchmarks.zip
  VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  ```
 And the results will be inside `./benchmarks/results`.
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -0,0 +1,39 @@
 # Nightly benchmark
 This benchmark aims to:
 - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
 - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 ## Setup
 - Docker images:
  - vLLM: `vllm/vllm-openai:v0.6.2`
  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
  - 8x Nvidia A100 GPUs
 - Workload:
  - Dataset
    - ShareGPT dataset
    - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
    - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
  - Models: llama-3 8B, llama-3 70B.
    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
 ## Known issues
 - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
 - TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -0,0 +1,196 @@
 # Pod-level settings shared by the kubernetes steps below, which pull them in
 # with the `<<: *common_pod_spec` merge key.
 common_pod_spec: &common_pod_spec
  priorityClassName: perf-benchmark
  # Pin the pods to 80GB A100 SXM4 nodes.
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  volumes:
    # Memory-backed emptyDir (mounted at /dev/shm by the container settings).
    - name: devshm
      emptyDir:
        medium: Memory
    # Host directory used as the Hugging Face cache; `type: Directory`
    # requires the path to already exist on the node.
    - name: hf-cache
      hostPath:
        path: /root/.cache/huggingface
        type: Directory
 # Container-level settings shared by the benchmark steps below, which pull
 # them in with the `<<: *common_container_settings` merge key.
 common_container_settings: &common_container_settings
  command:
    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  resources:
    limits:
      nvidia.com/gpu: 8
  # Mount points for the two volumes declared in `common_pod_spec`.
  volumeMounts:
    - name: devshm
      mountPath: /dev/shm
    - name: hf-cache
      mountPath: /root/.cache/huggingface
  env:
    - name: VLLM_USAGE_SOURCE
      value: ci-test
    # Same path as the hf-cache mount above.
    - name: HF_HOME
      value: /root/.cache/huggingface
    - name: VLLM_SOURCE_CODE_LOC
      value: /workspace/build/buildkite/vllm/performance-benchmark
    # HF_TOKEN is read from the `hf-token-secret` kubernetes secret.
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token
 steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
  # One step per serving engine. The steps differ only in the container image;
  # pod and container settings come from the shared anchors.
  # vLLM
  - label: "A100 vllm step 10"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.6.2
                <<: *common_container_settings
  # SGLang
  - label: "A100 sglang benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: lmsysorg/sglang:v0.3.2-cu121
                <<: *common_container_settings
  # LMDeploy
  - label: "A100 lmdeploy benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: openmmlab/lmdeploy:v0.6.1-cu12
                <<: *common_container_settings
  # TensorRT-LLM is benchmarked in two steps, one per model, selected through
  # the TEST_SELECTOR environment variable.
  # A key written explicitly in a mapping takes precedence over the same key
  # brought in by `<<`, and lists are not merged, so the whole `env` list from
  # `common_container_settings` is restated here in order to add TEST_SELECTOR.
  - label: "A100 trt llama-8B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama8B"
  # Same as the step above, with TEST_SELECTOR set to the 70B model.
  - label: "A100 trt llama-70B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama70B"
  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
  # - label: "A100 trt benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
  #               <<: *common_container_settings
  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
  # - label: "A100 tgi benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
  #               <<: *common_container_settings
  - wait
  # Runs nightly-annotate.sh once the benchmark steps before the `wait`
  # barrier have finished.
  - label: "Collect the results"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
            # NOTE(review): this image (v0.5.0.post1) is older than the one used
            # by the vLLM benchmark step (v0.6.2) - confirm this is intentional.
            - image: vllm/vllm-openai:v0.5.0.post1
              command:
              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
              # NOTE(review): 8 GPUs are requested for the annotation step as
              # well - confirm they are actually needed.
              resources:
                limits:
                  nvidia.com/gpu: 8
              # Only devshm is mounted here; the hf-cache volume from
              # `common_pod_spec` is not mounted in this container.
              volumeMounts:
              - name: devshm
                mountPath: /dev/shm
              env:
              - name: VLLM_USAGE_SOURCE
                value: ci-test
              - name: VLLM_SOURCE_CODE_LOC
                value: /workspace/build/buildkite/vllm/performance-benchmark
              - name: HF_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: hf-token-secret
                    key: token
  - block: ":rocket: check the results!"
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@@ -0,0 +1,56 @@
 ## Latency tests
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 {latency_tests_markdown_table}
 ## Throughput tests
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 {throughput_tests_markdown_table}
 ## Serving tests
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B, under QPS 2.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 {serving_tests_markdown_table}
 ## json version of the benchmarking tables
 This section contains the data of the markdown tables above in JSON format.
 You can load the benchmarking tables into pandas dataframes as follows:
 ```python
 import json
 import pandas as pd
 benchmarking_results_json = """The json string"""
 benchmarking_results = json.loads(benchmarking_results_json)
 latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
 throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
 serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
 ```
 The json string for all benchmarking tables:
 ```json
 {benchmarking_results_in_json_string}
 ```
 You can also check the raw experiment data in the Artifact tab of the Buildkite page.
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -0,0 +1,224 @@
 # SPDX-License-Identifier: Apache-2.0
 import json
 import os
 from pathlib import Path
 import pandas as pd
 from tabulate import tabulate
 # Folder (relative to the current working directory) that holds the per-test
 # result `*.json` files and their sibling `*.commands` files.
 results_folder = Path("results/")
 # latency results and the keys that will be printed into markdown
 # Each mapping below goes from a raw result key to the column header used in
 # the markdown table; keys that are commented out are left out of the table.
 latency_results = []
 latency_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "avg_latency": "Mean latency (ms)",
    # NOTE: the main loop converts every percentile to milliseconds, so the
    # "(s)" labels below need updating if these columns are re-enabled.
    # "P10": "P10 (s)",
    # "P25": "P25 (s)",
    "P50": "Median latency (ms)",
    # "P75": "P75 (s)",
    # "P90": "P90 (s)",
    "P99": "P99 latency (ms)",
 }
 # throughput tests and the keys that will be printed into markdown
 throughput_results = []
 throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "num_requests": "# of req.",
    # "total_num_tokens": "Total # of tokens",
    # "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    # "tokens_per_second": "Tput (tok/s)",
 }
 # serving results and the keys that will be printed into markdown
 serving_results = []
 serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    # "input_throughput": "Input Tput (tok/s)",
    # "output_throughput": "Output Tput (tok/s)",
    # TTFT: time to first token.
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    # "mean_tpot_ms": "Mean TPOT (ms)",
    # "median_tpot_ms": "Median",
    # "p99_tpot_ms": "P99",
    # ITL: inter-token latency.
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
 }
 def read_markdown(file):
    """Return the text of `file` followed by a newline.
    When `file` does not exist, a "<file> not found." line is returned
    instead of raising.
    """
    if not os.path.exists(file):
        return f"{file} not found.\n"
    with open(file) as handle:
        contents = handle.read()
    return contents + "\n"
 def results_to_json(latency, throughput, serving):
    """Serialize the three result DataFrames into a single JSON string.
    The top-level keys are "latency", "throughput" and "serving", each holding
    the `DataFrame.to_dict()` form of the corresponding table.
    """
    tables = {
        "latency": latency,
        "throughput": throughput,
        "serving": serving,
    }
    as_dicts = {name: frame.to_dict() for name, frame in tables.items()}
    return json.dumps(as_dicts)
 def _attach_command(raw_result, test_file):
    """Merge the benchmarking command stored next to `test_file` into `raw_result`.
    Reads the sibling file with the `.commands` suffix, merges its JSON content
    into `raw_result` and sets `test_name` to the stem of `test_file`.
    Returns True on success; prints the error and returns False when the
    `.commands` file cannot be opened.
    """
    try:
        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
    except OSError as e:
        print(e)
        return False
    # attach the benchmarking command to raw_result
    raw_result.update(command)
    # update the test name of this result
    raw_result.update({"test_name": test_file.stem})
    return True
 def _summarize_gpus(gpu_field):
    """Collapse a newline-separated GPU list into "<count>x<first GPU type>"."""
    # Split outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError on Python versions before 3.12.
    gpu_names = gpu_field.split("\n")
    return f"{len(gpu_names)}x{gpu_names[0]}"
 if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
        # Classify every result by the same path string.
        test_path = str(test_file)
        if "serving" in test_path:
            # this result is generated via `benchmark_serving.py`
            if _attach_command(raw_result, test_file):
                serving_results.append(raw_result)
        elif "latency" in test_path:
            # this result is generated via `benchmark_latency.py`
            if not _attach_command(raw_result, test_file):
                continue
            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result[f"P{perc}"] = 1000 * raw_result["percentiles"][str(perc)]
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
            latency_results.append(raw_result)
        elif "throughput" in test_path:
            # this result is generated via `benchmark_throughput.py`
            if _attach_command(raw_result, test_file):
                throughput_results.append(raw_result)
        else:
            print(f"Skipping {test_file}")
    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)
    # remapping the key, for visualization purpose
    if not latency_results.empty:
        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
            columns=latency_column_mapping
        )
    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
    if not throughput_results.empty:
        throughput_results = throughput_results[
            list(throughput_results_column_mapping.keys())
        ].rename(columns=throughput_results_column_mapping)
    # Snapshot for the JSON section of the markdown report. It is taken before
    # the sorting and GPU-name rewriting below, so it keeps the raw GPU strings.
    processed_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )
    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
            continue
        # Sort all dataframes by their respective "Test name" columns
        df.sort_values(by="Test name", inplace=True)
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(_summarize_gpus)
    # get markdown tables
    latency_md_table = tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:
        # The description file is a `str.format` template with one placeholder
        # per table plus one for the JSON dump.
        results = read_markdown(
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
        f.write(results)
    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
            + serving_results.to_dict(orient="records")
        )
        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -0,0 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 import argparse
 from transformers import AutoTokenizer
 def main(model, cachedir):
    """Load the tokenizer for `model` and save it into `cachedir`."""
    AutoTokenizer.from_pretrained(model).save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")
 if __name__ == "__main__":
    # Both options are required strings and are forwarded to main() as-is.
    cli = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    for flag, help_text in (
        ("--model", "Name of the model"),
        ("--cachedir", "Directory to save the tokenizer"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    options = cli.parse_args()
    main(options.model, options.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -0,0 +1,96 @@
 # SPDX-License-Identifier: Apache-2.0
 import argparse
 import json
 from pathlib import Path
 import numpy as np
 import pandas as pd
 from tabulate import tabulate
 def parse_arguments():
    """Parse the command line and return the resulting namespace.
    Both `--results-folder` and `--description` are required string options.
    """
    cli = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    required_options = (
        ("--results-folder", "The folder where the results are stored."),
        ("--description", "Description of the results."),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    return cli.parse_args()
 def get_perf(df, method, model, metric):
    """Return `metric` for one engine/model pair at each benchmarked QPS.
    Rows are selected by substring match of `model` and "qps_<qps>" against the
    "Test name" column and of `method` against the "Engine" column. The result
    is a numpy array with one entry per QPS in (2, 4, 8, 16, "inf"): the value
    of the first matching row, or 0.0 when no row matches.
    """
    def value_at(qps):
        selected = df["Test name"].str.contains(model)
        selected = selected & df["Engine"].str.contains(method)
        selected = selected & df["Test name"].str.contains("qps_" + str(qps))
        rows = df[selected]
        return 0.0 if rows.empty else rows[metric].values[0]
    return np.array([value_at(qps) for qps in (2, 4, 8, 16, "inf")])
 def get_perf_w_std(df, method, model, metric):
    """Return `(mean, std)` lists for `metric`, one entry per benchmarked QPS.
    For "TTFT" and "ITL", `mean` comes from the "Mean <metric> (ms)" column and
    `std` is the "Std <metric> (ms)" column divided by the square root of
    "Successful req."; `std` is None when the std column averages to zero.
    For "Tput", `mean` is the sum of the input and output token throughput and
    `std` is always None. Any other metric fails the assertion.
    """
    if metric not in ("TTFT", "ITL"):
        assert metric == "Tput"
        input_tput = get_perf(df, method, model, "Input Tput (tok/s)")
        output_tput = get_perf(df, method, model, "Output Tput (tok/s)")
        return (input_tput + output_tput).tolist(), None
    mean = get_perf(df, method, model, f"Mean {metric} (ms)").tolist()
    std = get_perf(df, method, model, f"Std {metric} (ms)")
    std_missing = std.mean() == 0
    # The success counts are looked up even when the std ends up discarded.
    success = get_perf(df, method, model, "Successful req.")
    if std_missing:
        return mean, None
    return mean, (std / np.sqrt(success)).tolist()
 def main(args):
    """Render the nightly results table into `nightly_results.md`.
    Concatenates every `*_nightly_results.json` file in `args.results_folder`,
    formats the records as a pipe-style markdown table and substitutes it for
    the `nightly_results_benchmarking_table` placeholder of the template read
    from `args.description`. The output is written to the working directory.
    """
    # collect results
    results = []
    for result_path in Path(args.results_folder).glob("*_nightly_results.json"):
        with open(result_path) as f:
            results = results + json.load(f)
    # generate markdown table
    table = tabulate(
        pd.DataFrame.from_dict(results),
        headers="keys",
        tablefmt="pipe",
        showindex=False,
    )
    with open(args.description) as f:
        template = f.read()
    rendered = template.format(nightly_results_benchmarking_table=table)
    with open("nightly_results.md", "w") as f:
        f.write(rendered)
 if __name__ == "__main__":
    # Script entry point: parse the CLI options and render the report.
    main(parse_arguments())
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -0,0 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 from lmdeploy.serve.openai.api_client import APIClient
 # Ask the API server at localhost:8000 for its models and print the first one.
 print(APIClient("http://localhost:8000").available_models[0])
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@@ -0,0 +1,228 @@
 #!/bin/bash
 # Currently FP8 benchmark is NOT enabled.
 # Trace every command as it is executed.
 set -x
 # $1: JSON object with the engine-specific server parameters (read with jq
 #     and json2args below).
 server_params=$1
 # $2: JSON object with the parameters shared by all engines (the launchers
 #     below read keys such as .model, .tp and .port from it).
 common_params=$2
 json2args() {
  # Transform a JSON object into command line args; every '_' in a key is
  # replaced with '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  #
  # $1: the JSON string to convert.
  # Prints the argument string on stdout. Returns non-zero and prints
  # nothing when jq cannot process the input.
  local json_string=$1
  # Declare and assign separately: `local args=$(...)` always returns the
  # status of `local` (0) and would hide a jq failure.
  local args
  args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  ) || return
  echo "$args"
 }
 launch_trt_server() {
  # Build a TensorRT-LLM engine for the configured model and start a Triton
  # server for it in the background.
  # Reads .model and .tp from $common_params; every other setting comes from
  # $server_params.
  model_path=$(echo "$common_params" | jq -r '.model')
  # drop everything up to and including the first '/' of the model path
  model_name="${model_path#*/}"
  model_type=$(echo "$server_params" | jq -r '.model_type')
  model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
  model_tp_size=$(echo "$common_params" | jq -r '.tp')
  max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
  max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
  max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
  max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
  trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
  # create model caching directory (any existing ~/models is wiped first)
  cd ~
  rm -rf models
  mkdir -p models
  cd models
  models_dir=$(pwd)
  # output locations for the converted checkpoint and the built engine
  trt_model_path=${models_dir}/${model_name}-trt-ckpt
  trt_engine_path=${models_dir}/${model_name}-trt-engine
  # clone tensorrt backend into /tensorrtllm_backend, pinned to the version
  # requested in the server parameters
  cd /
  rm -rf tensorrtllm_backend
  git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
  git lfs install
  cd tensorrtllm_backend
  git checkout "$trt_llm_version"
  git submodule update --init --recursive
  # build trtllm engine: convert the checkpoint, then build the engine from
  # the converted checkpoint
  cd /tensorrtllm_backend
  cd "./tensorrt_llm/examples/${model_type}"
  python3 convert_checkpoint.py \
    --model_dir "${model_path}" \
    --dtype "${model_dtype}" \
    --tp_size "${model_tp_size}" \
    --output_dir "${trt_model_path}"
  trtllm-build \
    --checkpoint_dir "${trt_model_path}" \
    --use_fused_mlp \
    --reduce_fusion disable \
    --workers 8 \
    --gpt_attention_plugin "${model_dtype}" \
    --gemm_plugin "${model_dtype}" \
    --tp_size "${model_tp_size}" \
    --max_batch_size "${max_batch_size}" \
    --max_input_len "${max_input_len}" \
    --max_seq_len "${max_seq_len}" \
    --max_num_tokens "${max_num_tokens}" \
    --output_dir "${trt_engine_path}"
  # handle triton protobuf files and launch triton server
  cd /tensorrtllm_backend
  mkdir triton_model_repo
  cp -r all_models/inflight_batcher_llm/* triton_model_repo/
  cd triton_model_repo
  # replace the contents of tensorrt_llm/1 with the freshly built engine
  rm -rf ./tensorrt_llm/1/*
  cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
  # fill in the config.pbtxt templates of each model in the repository
  python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
  python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
  python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
  python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
  python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
  # start the server in the background; this function does not wait for it
  cd /tensorrtllm_backend
  python3 scripts/launch_triton_server.py \
    --world_size="${model_tp_size}" \
    --model_repo=/tensorrtllm_backend/triton_model_repo &
 }
 launch_tgi_server() {
  # Start /tgi-entrypoint.sh in the background.
  # Model, shard count and port are read from $common_params; $server_params
  # is expanded into extra flags by json2args. When $common_params has an
  # "fp8" key, `--quantize fp8` is added to the command.
  model=$(jq -r '.model' <<<"$common_params")
  tp=$(jq -r '.tp' <<<"$common_params")
  port=$(jq -r '.port' <<<"$common_params")
  server_args=$(json2args "$server_params")

  if ! jq -e 'has("fp8")' <<<"$common_params" >/dev/null; then
    printf '%s\n' "Key 'fp8' does not exist in common params."
    server_command="/tgi-entrypoint.sh \
                --model-id $model \
                --num-shard $tp \
                --port $port \
                $server_args"
  else
    printf '%s\n' "Key 'fp8' exists in common params."
    server_command="/tgi-entrypoint.sh \
                --model-id $model \
                --num-shard $tp \
                --port $port \
                --quantize fp8 \
                $server_args"
  fi

  printf '%s\n' "Server command: $server_command"
  eval "$server_command" &
 }
 launch_lmdeploy_server() {
  # Start `lmdeploy serve api_server` in the background.
  # Model, tensor-parallel size and port are read from $common_params;
  # $server_params is expanded into extra flags by json2args.
  server_args=$(json2args "$server_params")
  port=$(jq -r '.port' <<<"$common_params")
  tp=$(jq -r '.tp' <<<"$common_params")
  model=$(jq -r '.model' <<<"$common_params")

  server_command="lmdeploy serve api_server $model \
    --tp $tp \
    --server-port $port \
    $server_args"

  # show the command, then run it in a child bash
  printf '%s\n' "Server command: $server_command"
  bash -c "$server_command" &
 }
 launch_sglang_server() {
  # Start `python3 -m sglang.launch_server` in the background.
  # Model, tensor-parallel size and port are read from $common_params;
  # $server_params is expanded into extra flags by json2args. When
  # $common_params has an "fp8" key, the model is replaced by its
  # .neuralmagic_quantized_model entry.
  model=$(jq -r '.model' <<<"$common_params")
  tp=$(jq -r '.tp' <<<"$common_params")
  port=$(jq -r '.port' <<<"$common_params")
  server_args=$(json2args "$server_params")

  if jq -e 'has("fp8")' <<<"$common_params" >/dev/null; then
    printf '%s\n' "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(jq -r '.neuralmagic_quantized_model' <<<"$common_params")
  else
    printf '%s\n' "Key 'fp8' does not exist in common params."
  fi

  # The command line has the same shape in both cases; only $model differs.
  server_command="python3 \
        -m sglang.launch_server \
        --tp $tp \
        --model-path $model \
        --port $port \
        $server_args"

  printf '%s\n' "Server command: $server_command"
  eval "$server_command" &
 }
 launch_vllm_server() {
  # Start the vLLM OpenAI API server module in the background.
  # Exports VLLM_HOST_IP as the first address printed by `hostname -I`.
  # Model, tensor-parallel size and port are read from $common_params;
  # $server_params is expanded into extra flags by json2args. When
  # $common_params has an "fp8" key, the model is replaced by its
  # .neuralmagic_quantized_model entry.
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  model=$(jq -r '.model' <<<"$common_params")
  tp=$(jq -r '.tp' <<<"$common_params")
  port=$(jq -r '.port' <<<"$common_params")
  server_args=$(json2args "$server_params")

  if jq -e 'has("fp8")' <<<"$common_params" >/dev/null; then
    printf '%s\n' "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(jq -r '.neuralmagic_quantized_model' <<<"$common_params")
  else
    printf '%s\n' "Key 'fp8' does not exist in common params."
  fi

  # The command line has the same shape in both cases; only $model differs.
  server_command="python3 \
        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
        --model $model \
        --port $port \
        $server_args"

  printf '%s\n' "Server command: $server_command"
  eval "$server_command" &
 }
 main() {
  # Dispatch to the launcher matching $CURRENT_LLM_SERVING_ENGINE.
  # The first four names must match exactly; any value containing "vllm"
  # selects the vLLM launcher. Unknown values launch nothing.
  case "$CURRENT_LLM_SERVING_ENGINE" in
    trt) launch_trt_server ;;
    tgi) launch_tgi_server ;;
    lmdeploy) launch_lmdeploy_server ;;
    sglang) launch_sglang_server ;;
    *vllm*) launch_vllm_server ;;
  esac
 }
 main
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -0,0 +1,78 @@
 #!/bin/bash
 # Abort on the first failing command, trace commands, and make a pipeline
 # fail when any of its stages fails.
 set -ex
 set -o pipefail
 main() {
    # Package the nightly benchmark results and scripts, upload them as
    # Buildkite artifacts, and append nightly-annotation.md to the build
    # annotations.
    # Requires $VLLM_SOURCE_CODE_LOC to point at the vLLM source tree.

    # install wget, curl, jq and zip through apt-get when they are missing
    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)
    (which zip) || (apt-get install -y zip)
    # without the agent binary nothing can be uploaded; end the script
    # successfully
    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
        exit 0
    fi
    # initial annotation
    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
    # download results
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    # list what was downloaded (shows up in the build log)
    ls
    ls results/
    # upload benchmark results
    zip -r results.zip results/
    /workspace/buildkite-agent artifact upload "results.zip"
    # upload benchmarking scripts
    cd "$VLLM_SOURCE_CODE_LOC/"
    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    # upload benchmarking pipeline
    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
    # append the prepared annotation text to the build page
    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
    # The figures should be generated by a separate process outside the CI/CD pipeline
    # # generate figures
    # python3 -m pip install tabulate pandas matplotlib
    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
    #     --description $description \
    #     --results-folder results/
    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sharegpt
    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sonnet_2048_128
    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sonnet_128_2048
    # # upload results and figures
    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
 }
 main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -0,0 +1,462 @@
 #!/bin/bash
 # A pipeline fails when any of its stages fails; every command is traced.
 # `set -e` is not enabled here, so a failing command does not stop the script.
 set -o pipefail
 set -x
 check_gpus() {
  # Count the GPUs listed by nvidia-smi and record the GPU type.
  # Sets the globals gpu_count and gpu_type; exits with status 1 when no
  # GPU is listed.
  declare -g gpu_count
  gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if ! [[ $gpu_count -gt 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."

  # second whitespace-separated field of every reported GPU name
  declare -g gpu_type
  gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
  echo "GPU type is $gpu_type"
 }
 check_hf_token() {
  # Validate $HF_TOKEN: it must be non-empty and start with "hf_".
  # Exits with status 1 otherwise.
  case "$HF_TOKEN" in
    "")
      echo "Error: HF_TOKEN is not set."
      exit 1
      ;;
    hf_*)
      echo "HF_TOKEN is set and valid."
      ;;
    *)
      echo "Error: HF_TOKEN does not start with 'hf_'."
      exit 1
      ;;
  esac
 }
 upload_to_buildkite() {
  # Upload everything under $RESULTS_FOLDER as Buildkite artifacts.
  # When /workspace/buildkite-agent is missing, report it and return 0.
  local agent=/workspace/buildkite-agent
  if [[ -f "$agent" ]]; then
    # Annotation is currently disabled:
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    "$agent" artifact upload "$RESULTS_FOLDER/*"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
 }
 get_current_llm_serving_engine() {
  # Detect which serving engine's container this script runs in and export
  # it as CURRENT_LLM_SERVING_ENGINE. The first matching probe wins; when
  # none matches the variable is left untouched.
  if which lmdeploy >/dev/null; then
    echo "Container: lmdeploy"
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
  elif [ -e /tgi-entrypoint.sh ]; then
    echo "Container: tgi"
    export CURRENT_LLM_SERVING_ENGINE=tgi
  elif which trtllm-build >/dev/null; then
    echo "Container: tensorrt-llm"
    export CURRENT_LLM_SERVING_ENGINE=trt
  elif [ -e /sgl-workspace ]; then
    echo "Container: sglang"
    export CURRENT_LLM_SERVING_ENGINE=sglang
  elif [ -e /vllm-workspace ]; then
    echo "Container: vllm"
    export CURRENT_LLM_SERVING_ENGINE=vllm
  fi
 }
 json2args() {
  # Transform a JSON object into command line args; every '_' in a key is
  # replaced with '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  #
  # $1: the JSON string to convert.
  # Prints the argument string on stdout. Returns non-zero and prints
  # nothing when jq cannot process the input.
  local json_string=$1
  # Declare and assign separately: `local args=$(...)` always returns the
  # status of `local` (0) and would hide a jq failure.
  local args
  args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  ) || return
  echo "$args"
 }
 kill_gpu_processes() {
  # Kill every process whose command line matches one of the serving-engine
  # patterns, then block until the first GPU's memory.used value reported by
  # nvidia-smi is below 1000.
  local pattern
  for pattern in python python3 tritonserver pt_main_thread text-generation lmdeploy; do
    pkill -f "$pattern"
  done
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done
 }
 wait_for_server() {
  # Poll localhost:8000/v1/completions once a second until curl succeeds.
  # Returns 0 once the endpoint answers, 1 when the 1200 second timeout
  # expires first.
  if timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done'; then
    return 0
  else
    return 1
  fi
 }
 ensure_installed() {
  # Install the apt package named $1 unless a command of that name is
  # already found by `which`.
  local cmd=$1
  which "$cmd" >/dev/null || { apt-get update && apt-get install -y "$cmd"; }
 }
 run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases
  # For every test case: (re)launch the server of the current engine unless
  # the case asks to reuse it, wait until it answers, then run the benchmark
  # client once per entry of the case's qps_list. GPU processes are killed
  # once all cases are done.
  local serving_test_file
  serving_test_file=$1
  # Iterate over serving tests
  # NOTE: the loop is the right-hand side of a pipe, so in bash (without
  # lastpipe) it runs in a subshell: `exit` inside it ends only that subshell
  # and variables set inside are not visible after the loop.
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')
    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi
    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
    # get client and server arguments (keys are prefixed with the engine name)
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
    client_args=$(json2args "$client_params")
    # one shell-quoted qps value per line
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi
    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      # start from a clean GPU, then launch the server for this test case
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi
    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      # give up on all remaining test cases
      break
    fi
    # prepare tokenizer
    # this is required for lmdeploy.
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    # change model name for lmdeploy (it will not follow standard hf name)
    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
    fi
    # iterate over different QPS
    for qps in $qps_list; do
      # any entry containing "inf" (e.g. the quoted 'inf' produced by @sh
      # above) is normalized to a bare inf
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi
      new_test_name=$test_name"_qps_"$qps
      # map the engine name to the backend name passed to benchmark_serving.py
      backend=$CURRENT_LLM_SERVING_ENGINE
      if [[ $backend = "trt" ]]; then
        backend="tensorrt-llm"
      fi
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      if [[ "$dataset_name" = "sharegpt" ]]; then
        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"
      elif [[ "$dataset_name" = "sonnet" ]]; then
        # the sonnet dataset additionally needs its length settings
        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --sonnet-input-len $sonnet_input_len \
          --sonnet-output-len $sonnet_output_len \
          --sonnet-prefix-len $sonnet_prefix_len \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"
      else
        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1
      fi
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"
      eval "$client_command"
      # the server is started by launch-server.sh, so its command line is not
      # known here
      server_command="None"
      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
    done
  done
  kill_gpu_processes
 }
 run_genai_perf_tests() {
  # run genai-perf tests
  # $1: a json file specifying genai-perf test cases
  # For every test case: (re)launch the server of the current engine unless
  # the case asks to reuse it, wait until it answers, then run
  # `genai-perf profile` once per entry of the case's qps_list. GPU processes
  # are killed once all cases are done.
  local genai_perf_test_file
  genai_perf_test_file=$1
  # Iterate over genai-perf tests
  # NOTE: the loop is the right-hand side of a pipe, so in bash (without
  # lastpipe) it runs in a subshell.
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')    
    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi
    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
    # get server arguments (the key is prefixed with the engine name)
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    # one shell-quoted qps value per line
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi
    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      # start from a clean GPU, then launch the server for this test case
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi
    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      # give up on all remaining test cases
      break
    fi
    # iterate over different QPS
    for qps in $qps_list; do
      # any entry containing "inf" is replaced by the number of prompts
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi
      # NOTE(review): new_test_name and backend are computed here but not used
      # by the command below, which hard-codes `--backend vllm` — confirm
      # whether that is intended.
      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      #TODO: add output dir.
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend vllm \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
        --request-rate $qps \
        --num-prompts $num_prompts \
      "
    echo "Client command: $client_command"
    eval "$client_command"
    #TODO: process/record outputs
    done
  done
  kill_gpu_processes
 }
 prepare_dataset() {
  # Fetch the ShareGPT dataset into the benchmarks directory and build
  # sonnet_4x.txt there: one empty line followed by four copies of
  # sonnet.txt, to allow benchmarking with input length 2048.
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  {
    echo ""
    cat sonnet.txt sonnet.txt sonnet.txt sonnet.txt
  } > sonnet_4x.txt
 }
 main() {
  # Entry point of the nightly benchmark: verify the environment, install
  # the required tools, prepare the datasets, run the serving and genai-perf
  # test suites, then summarize and upload the results.

  # check that a GPU and a plausible HF_TOKEN are available, and detect which
  # engine's container we are in
  check_gpus
  check_hf_token
  get_current_llm_serving_engine
  pip install -U transformers
  pip install -r requirements/dev.txt
  # print where genai-perf is installed (nothing is printed if it is missing)
  which genai-perf
  # check storage
  df -h
  ensure_installed wget
  ensure_installed curl
  ensure_installed jq
  # genai-perf dependency
  ensure_installed libb64-0d
  prepare_dataset
  # results are collected in benchmarks/results/
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
  # run the test
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
  # run genai-perf tests
  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
  # move the artifacts/ directory into the results folder
  # (presumably written by genai-perf — TODO confirm)
  mv artifacts/ $RESULTS_FOLDER/
  # summarize the results, then upload them to buildkite
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
 }
 main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -0,0 +1,400 @@
 #!/bin/bash
 # This script should be run inside the CI process
 # This script assumes that we are already inside the vllm/ directory
 # Benchmarking results will be available inside vllm/benchmarks/results/
 # Do not set -e, as the mixtral 8x22B model tends to crash occasionally
 # and we still want to see other benchmarking results even when mixtral crashes.
 # Trace every command; a pipeline fails when any of its stages fails.
 set -x
 set -o pipefail
 check_gpus() {
  # Count the GPUs and record the GPU type, using nvidia-smi when it is
  # available and amd-smi otherwise.
  # Sets the globals gpu_count and gpu_type; exits with status 1 when no
  # GPU is found.
  if command -v nvidia-smi; then
    declare -g gpu_count
    gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count
    gpu_count=$(amd-smi list | grep -c 'GPU')
  fi

  if ! [[ $gpu_count -gt 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."

  # second whitespace-separated field of the reported device name
  if command -v nvidia-smi; then
    declare -g gpu_type
    gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type
    gpu_type=$(amd-smi static -g 0 -a | awk '/MARKET_NAME/ {print $2}')
  fi
  echo "GPU type is $gpu_type"
 }
 check_hf_token() {
  # Validate $HF_TOKEN: it must be non-empty and start with "hf_".
  # Exits with status 1 otherwise.
  case "$HF_TOKEN" in
    "")
      echo "Error: HF_TOKEN is not set."
      exit 1
      ;;
    hf_*)
      echo "HF_TOKEN is set and valid."
      ;;
    *)
      echo "Error: HF_TOKEN does not start with 'hf_'."
      exit 1
      ;;
  esac
 }
 ensure_sharegpt_downloaded() {
  # Download the ShareGPT dataset into the current directory unless a file
  # of that name is already there.
  local dataset_file=ShareGPT_V3_unfiltered_cleaned_split.json
  if [ -f "$dataset_file" ]; then
    echo "$dataset_file already exists."
  else
    wget "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$dataset_file"
  fi
 }
 json2args() {
  # Transform a JSON object into command line args; every '_' in a key is
  # replaced with '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  #
  # $1: the JSON string to convert.
  # Prints the argument string on stdout. Returns non-zero and prints
  # nothing when jq cannot process the input.
  local json_string=$1
  # Declare and assign separately: `local args=$(...)` always returns the
  # status of `local` (0) and would hide a jq failure.
  local args
  args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  ) || return
  echo "$args"
 }
 wait_for_server() {
  # POST to localhost:8000/v1/completions once a second until curl succeeds.
  # Returns 0 once the endpoint answers, 1 when the 1200 second timeout
  # expires first.
  if timeout 1200 bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done'; then
    return 0
  else
    return 1
  fi
 }
 kill_processes_launched_by_current_bash() {
  # Send SIGKILL to every direct child of this shell whose command (first
  # word of the command line) matches the awk regex given as $1.
  current_shell_pid=$$
  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
  if [ -z "$processes" ]; then
    echo "No processes found matching '$1'."
  else
    echo "Killing the following processes matching '$1':"
    echo "$processes"
    xargs kill -9 <<<"$processes"
  fi
 }
 kill_gpu_processes() {
  # Kill whatever listens on port 8000 and every python3 process, wait until
  # the reported GPU memory usage drops below 1000, then delete the vllm
  # config directory.
  local used_mem
  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9

  # poll the first GPU once a second
  if command -v nvidia-smi; then
    while used_mem=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1); [ "$used_mem" -ge 1000 ]; do
      sleep 1
    done
  elif command -v amd-smi; then
    while used_mem=$(amd-smi metric -g 0 | awk '/USED_VRAM/ {print $2}'); [ "$used_mem" -ge 1000 ]; do
      sleep 1
    done
  fi

  # remove vllm config file
  rm -rf ~/.config/vllm
 }
 upload_to_buildkite() {
  # Annotate the build with $RESULTS_FOLDER/benchmark_results.md and upload
  # everything under $RESULTS_FOLDER as artifacts.
  # The agent is taken from PATH when available, otherwise from
  # /workspace/buildkite-agent; when neither exists the upload is skipped
  # and 0 is returned.
  if command -v buildkite-agent >/dev/null 2>&1; then
    BUILDKITE_AGENT_COMMAND="buildkite-agent"
  else
    if [ ! -f /workspace/buildkite-agent ]; then
      echo "buildkite-agent binary not found. Skip uploading the results."
      return 0
    fi
    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
  fi

  "$BUILDKITE_AGENT_COMMAND" annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" \
    < "$RESULTS_FOLDER/benchmark_results.md"
  "$BUILDKITE_AGENT_COMMAND" artifact upload "$RESULTS_FOLDER/*"
 }
 run_latency_tests() {
  # Run every latency test case with `benchmark_latency.py`.
  # $1: a json file specifying latency test cases
  # For each case the benchmark command and GPU type are recorded in
  # $RESULTS_FOLDER/<test_name>.commands before the benchmark runs.
  local latency_test_file=$1

  # The loop stays on the right-hand side of the pipe on purpose: `exit`
  # inside it then ends only the pipeline's subshell.
  jq -c '.[]' "$latency_test_file" | while read -r params; do
    test_name=$(jq -r '.test_name' <<<"$params")
    if [[ "$test_name" != latency_* ]]; then
      echo "In latency-test.json, test_name must start with \"latency_\"."
      exit 1
    fi

    # honor the optional TEST_SELECTOR regex
    if [[ -n "$TEST_SELECTOR" && ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    latency_params=$(jq -r '.parameters' <<<"$params")
    latency_args=$(json2args "$latency_params")

    # skip cases that need more GPUs than are available
    tp=$(jq -r '.tensor_parallel_size' <<<"$latency_params")
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    latency_command="python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"
    echo "Running test case $test_name"
    echo "Latency command: $latency_command"

    # record the benchmarking command and the GPU type
    jq_output=$(jq -n --arg latency "$latency_command" --arg gpu "$gpu_type" \
      '{latency_command: $latency, gpu_type: $gpu}')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark, then free the GPU for the next case
    eval "$latency_command"
    kill_gpu_processes
  done
 }
 run_throughput_tests() {
  # Run every throughput test case with `benchmark_throughput.py`.
  # $1: a json file specifying throughput test cases
  # For each case the benchmark command and GPU type are recorded in
  # $RESULTS_FOLDER/<test_name>.commands before the benchmark runs.
  local throughput_test_file=$1

  # The loop stays on the right-hand side of the pipe on purpose: `exit`
  # inside it then ends only the pipeline's subshell.
  jq -c '.[]' "$throughput_test_file" | while read -r params; do
    test_name=$(jq -r '.test_name' <<<"$params")
    if [[ "$test_name" != throughput_* ]]; then
      echo "In throughput-test.json, test_name must start with \"throughput_\"."
      exit 1
    fi

    # honor the optional TEST_SELECTOR regex
    if [[ -n "$TEST_SELECTOR" && ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    throughput_params=$(jq -r '.parameters' <<<"$params")
    throughput_args=$(json2args "$throughput_params")

    # skip cases that need more GPUs than are available
    tp=$(jq -r '.tensor_parallel_size' <<<"$throughput_params")
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    throughput_command="python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"
    echo "Running test case $test_name"
    echo "Throughput command: $throughput_command"

    # record the benchmarking command and the GPU type
    jq_output=$(jq -n --arg command "$throughput_command" --arg gpu "$gpu_type" \
      '{throughput_command: $command, gpu_type: $gpu}')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark, then free the GPU for the next case
    eval "$throughput_command"
    kill_gpu_processes
  done
 }
 run_serving_tests() {
  # Run the serving benchmarks that drive `benchmark_serving.py`.
  # $1: a json file holding an array of serving test cases.
  # For each case: start the OpenAI-compatible vllm server, run the client
  # once per entry of the case's qps_list, record the commands next to the
  # results, then tear the server down.
  local serving_test_file=$1
  # NOTE(review): the loop is the last stage of a pipeline, so in bash (unless
  # `lastpipe` is enabled) it runs in a subshell and the `exit 1` below ends
  # only that subshell - confirm this is the intended behaviour.
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # every serving test case must be named "serving_..."
    test_name=$(jq -r '.test_name' <<<"$params")
    [[ "$test_name" =~ ^serving_ ]] || {
      echo "In serving-test.json, test_name must start with \"serving_\"."
      exit 1
    }
    # when TEST_SELECTOR is set, run only the test cases matching that regex
    if [[ -n "$TEST_SELECTOR" && ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi
    # split the case into server / client parameters and CLI arguments
    server_params=$(jq -r '.server_parameters' <<<"$params")
    client_params=$(jq -r '.client_parameters' <<<"$params")
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    # one shell-quoted qps value per line (string entries come out as 'inf')
    qps_list=$(jq -r '.qps_list' <<<"$params")
    qps_list=$(jq -r '.[] | @sh' <<<"$qps_list")
    echo "Running over qps list $qps_list"
    # skip the case when it needs more GPUs than this machine has
    tp=$(jq -r '.tensor_parallel_size' <<<"$server_params")
    if [[ $tp -gt $gpu_count ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi
    # the client must benchmark the same model that the server serves
    server_model=$(jq -r '.model' <<<"$server_params")
    client_model=$(jq -r '.model' <<<"$client_params")
    if [[ "$server_model" != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi
    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"
    # launch the server in the background and remember its pid for cleanup
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    bash -c "$server_command" &
    server_pid=$!
    # block until the server answers or the wait times out; the benchmark
    # proceeds either way, only the reported status differs
    if wait_for_server; then
      server_status="vllm server is up and running."
    else
      server_status="vllm failed to start within the timeout period."
    fi
    echo ""
    echo "$server_status"
    # one client run per requested QPS
    for qps in $qps_list; do
      # drop the single quotes that @sh put around the "inf" entry
      case "$qps" in
        *inf*)
          echo "qps was $qps"
          qps="inf"
          echo "now qps is $qps"
          ;;
      esac
      new_test_name="${test_name}_qps_${qps}"
      # the tensor parallel size is forwarded as metadata so that it can be
      # displayed on the benchmark dashboard
      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --metadata "tensor_parallel_size=$tp" \
        $client_args"
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"
      bash -c "$client_command"
      # store the exact commands and GPU type alongside the result file
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
    done
    # stop the server and release the GPUs before the next test case
    kill -9 "$server_pid"
    kill_gpu_processes
  done
 }
 main() {
  # Entry point: check the host, install the helper tools, run the serving,
  # latency and throughput suites, then post-process and upload the results.
  local suite
  check_gpus
  check_hf_token
  # ENGINE_VERSION=v1 selects the v1 engine (anything else leaves it unset)
  case "${ENGINE_VERSION:-v0}" in
    v1) export VLLM_USE_V1=1 ;;
  esac
  # dependencies: install each tool only when it is not already on PATH
  if ! (which wget && which curl); then
    apt-get update && apt-get install -y wget curl
  fi
  if ! (which jq); then
    apt-get update && apt-get -y install jq
  fi
  if ! (which lsof); then
    apt-get update && apt-get install -y lsof
  fi
  # get the current IP address, required by benchmark_serving.py
  VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  export VLLM_HOST_IP
  # turn off the per-request status reporting to keep the terminal output clean
  export VLLM_LOGGING_LEVEL=WARNING
  # prepare for benchmarking
  cd benchmarks || exit 1
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
  # benchmarking: each suite reads <suite>-tests.json from the tests folder
  for suite in serving latency throughput; do
    "run_${suite}_tests" "$QUICK_BENCHMARK_ROOT/tests/${suite}-tests.json"
  done
  # postprocess benchmarking results
  pip install tabulate pandas
  python3 "$QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py"
  upload_to_buildkite
 }
 main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -0,0 +1,81 @@
 # SPDX-License-Identifier: Apache-2.0
 import datetime
 import json
 import os
 from pathlib import Path
 import pandas as pd
 from tabulate import tabulate
 # Folder (relative to the working directory) that is scanned for "*.json"
 # result files and that receives the generated summary files.
 results_folder = Path("results/")
 # serving results and the keys that will be printed into markdown
 # (filled with one dict per result file, later turned into a DataFrame)
 serving_results = []
 # Maps a raw result key to the column header used in the summary table.
 # Only these keys are kept, in this order, when the table is built.
 serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "std_tpot_ms": "Std TPOT (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
 }
 if __name__ == "__main__":
    # Collect every "*.json" result in the results folder, merge in the
    # commands recorded in the sibling ".commands" file, and write a markdown
    # and a json summary back into the same folder.
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.load(f)
        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands")) as f:
            raw_result.update(json.load(f))
        # the test name is the result file name without its extension
        raw_result["test_name"] = test_file.stem
        serving_results.append(raw_result)
    serving_results = pd.DataFrame.from_dict(serving_results)
    if not serving_results.empty:
        # keep only the documented columns and give them readable headers
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    # os.environ.get() returns None when the variable is unset, and
    # concatenating None to a str raises TypeError; fall back to a placeholder
    # so the collected results are still written out.
    engine = os.environ.get("CURRENT_LLM_SERVING_ENGINE", "unknown")
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = f"{timestamp}_{engine}"
    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with header.
        # for those who want to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")
    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")
        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -0,0 +1,23 @@
 #!/bin/sh
 # Poll the public ECR registry until the image manifest for
 # $BUILDKITE_COMMIT is available.
 # Exits 0 as soon as the manifest request returns HTTP 200, and 1 after
 # 1000 unsuccessful attempts (5 s pause between attempts).
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
 # Builds of main are looked up in the postmerge repo, all others in the test
 # repo. Use the POSIX `[ ... = ... ]` test: this script runs under /bin/sh,
 # where the bash-only `[[ ... == ... ]]` is not guaranteed to exist.
 if [ "$BUILDKITE_BRANCH" = "main" ]; then
    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
 else
    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
 fi
 # upper bound, in seconds, for a single manifest request
 TIMEOUT_SECONDS=10
 retries=0
 while [ $retries -lt 1000 ]; do
    # compare the status code as a string so an empty/non-numeric curl output
    # simply counts as "not ready" instead of producing a test error
    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" = "200" ]; then
        exit 0
    fi
    echo "Waiting for image to be available..."
    retries=$((retries + 1))
    sleep 5
 done
 exit 1
--- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@@ -0,0 +1,23 @@
 [
    {
        "test_name": "llama8B_tp1_genai_perf",
        "qps_list": [4,8,16,32],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "port": 8000,
            "num_prompts": 500,
            "reuse_server": false
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "genai_perf_input_parameters": {
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@@ -0,0 +1,32 @@
 [
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama70B_tp4",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15
        }
    },
    {
        "test_name": "latency_mixtral8x7B_tp2",
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -0,0 +1,323 @@
 [
    {
        "test_name": "llama8B_tp1_sharegpt",
        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
            "port": 8000,
            "reuse_server": false
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "enable_torch_compile": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama8B_tp1_sonnet_512_16",
        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "dataset_name": "sonnet",
            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
            "port": 8000,
            "sonnet_input_len": 512,
            "sonnet_output_len": 16,
            "sonnet_prefix_len": 50,
            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "enable_torch_compile": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama8B_tp1_sonnet_512_256",
        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "dataset_name": "sonnet",
            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
            "port": 8000,
            "sonnet_input_len": 512,
            "sonnet_output_len": 256,
            "sonnet_prefix_len": 50,
            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "enable_torch_compile": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama70B_tp4_sharegpt",
        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
            "port": 8000,
            "reuse_server": false
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama70B_tp4_sonnet_512_16",
        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sonnet",
            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
            "port": 8000,
            "sonnet_input_len": 512,
            "sonnet_output_len": 16,
            "sonnet_prefix_len": 50,
            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    },
    {
        "test_name": "llama70B_tp4_sonnet_512_256",
        "qps_list": [4,8,16,32,"inf"],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sonnet",
            "dataset_path": "./sonnet_4x.txt",
            "num_prompts": 500,
            "port": 8000,
            "sonnet_input_len": 512,
            "sonnet_output_len": 256,
            "sonnet_prefix_len": 50,
            "reuse_server": true
        },
        "lmdeploy_server_parameters": {
            "dtype": "bfloat16"
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "bfloat16",
            "max_batch_size": 2048,
            "max_input_len": 4096,
            "max_seq_len": 6144,
            "max_num_tokens": 16384,
            "trt_llm_version": "v0.11.0"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "vllm_client_parameters": {
        },
        "sglang_server_parameters": {
            "disable_radix_cache": "",
            "dtype": "bfloat16"
        },
        "sglang_client_parameters": {
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -0,0 +1,81 @@
 [
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama70B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
        "qps_list": [2],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "disable_log_requests": "", 
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "speculative_config": {
                "model": "turboderp/Qwama-0.5B-Instruct",
                "num_speculative_tokens": 4,
                "draft_tensor_parallel_size": 1
            }
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200 
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@@ -0,0 +1,35 @@
 [
    {
        "test_name": "throughput_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama70B_tp4",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_mixtral8x7B_tp2",
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
 ]
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
@@ -0,0 +1,46 @@
 # This local pyproject file is part of the migration from yapf to ruff format.
 # It uses the same core rules as the main pyproject.toml file, but with the
 # following differences:
 # - ruff line length is overridden to 88
 # - deprecated typing ignores (UP006, UP035) have been removed
 [tool.ruff]
 line-length = 88
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
 [tool.ruff.lint]
 select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
 ]
 ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
 ]
 [tool.ruff.format]
 docstring-code-format = true
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -0,0 +1,105 @@
 steps:
  - label: "Build wheel - CUDA 12.8"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  - label: "Build wheel - CUDA 12.6"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
  # However, this block can be uncommented to save some compute hours.
  # - block: "Build CUDA 11.8 wheel"
  #   key: block-build-cu118-wheel
  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  - block: "Build release image"
    depends_on: ~
    key: block-release-image-build
  - label: "Build release image"
    depends_on: block-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: tpu_queue_postmerge
    commands:
      - "yes | docker system prune -a"
      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
      - docker-login#v3.0.0:
          username: vllmbot
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
  - input: "Provide Release version here"
    fields:
      - text: "What is the release version?"
        key: "release-version"
  - block: "Build CPU release image"
    key: block-cpu-release-image-build
    depends_on: ~
  - label: "Build and publish CPU release image"
    depends_on: block-cpu-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
  - block: "Build Neuron release image"
    key: block-neuron-release-image-build
    depends_on: ~
  - label: "Build and publish Neuron release image"
    depends_on: block-neuron-release-image-build
    agents:
      queue: neuron-postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -1,38 +0,0 @@
 # This script build the ROCm docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Print ROCm version
 rocminfo
 # Try building the docker image
 docker build -t rocm -f Dockerfile.rocm .
 # Setup cleanup
 remove_docker_container() { docker rm -f rocm || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image
 docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
 # Wait for the server to start
 wait_for_server_to_start() {
    # Poll localhost:8000/health once per second until it answers HTTP 200.
    # After $timeout polls a message is printed and the wait is abandoned;
    # the function does not report the timeout through its exit status.
    timeout=300
    counter=0
    until [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" = "200" ]; do
        sleep 1
        counter=$((counter + 1))
        # keep polling while the budget is not used up
        [ "$counter" -lt "$timeout" ] && continue
        echo "Timeout after $timeout seconds"
        break
    done
 }
 wait_for_server_to_start
 # Test a simple prompt
 curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -0,0 +1,241 @@
 #!/bin/bash
 # This script runs test inside the corresponding ROCm docker container.
 set -o pipefail
 # Export Python path
 export PYTHONPATH=".."
 # Print ROCm version
 echo "--- Confirming Clean Initial State"
 while true; do
        sleep 3
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
                echo "GPUs state is \"clean\""
                break
        fi
 done
 echo "--- ROCm info"
 rocminfo
 # cleanup older docker images
 cleanup_docker() {
   # Prune Docker images and volumes when the filesystem that holds Docker's
   # root directory is more than $threshold percent full.
   threshold=70

   # Ask the daemon where it stores its data; abort the script if it cannot tell us.
   docker_root=$(docker info -f '{{.DockerRootDir}}')
   if [ -z "$docker_root" ]; then
     echo "Failed to determine Docker root directory."
     exit 1
   fi
   echo "Docker root directory: $docker_root"

   # Fifth column of the last `df` line is the use percentage; drop the '%' sign.
   disk_usage=$(df "$docker_root" | tail -n 1 | awk '{print $5}' | sed 's/%//')

   # Guard clause: nothing to do unless usage is strictly above the threshold.
   if ! [ "$disk_usage" -gt "$threshold" ]; then
     echo "Disk usage is below $threshold%. No cleanup needed."
     return
   fi

   echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
   # Dangling images first (untagged and not used by any container).
   docker image prune -f
   # Then unused volumes, followed by a forced system prune of anything older than 72h.
   docker volume prune -f && docker system prune --force --filter "until=72h" --all
   echo "Docker images and volumes cleanup completed."
 }
 # Call the cleanup docker function
 cleanup_docker
 echo "--- Resetting GPUs"
 echo "reset" > /opt/amdgpu/etc/gpu_state
 while true; do
        sleep 3
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
                echo "GPUs state is \"clean\""
                break
        fi
 done
 echo "--- Pulling container" 
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
 # Cleanup handler (installed as the EXIT trap right below): force-removes the
 # container named ${container_name}.
 # NOTE(review): the commands are chained with `||`, so the image removal only
 # runs when `docker rm` itself fails, and the trailing `|| true` keeps the
 # handler from reporting a failure. Confirm that skipping the image removal
 # after a successful container removal is intended.
 remove_docker_container() {
   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT
 echo "--- Running container"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
 fi
 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
 fi
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
  --ignore=kernels/core/test_fused_quant_layernorm.py \
  --ignore=kernels/core/test_permute_cols.py"
 fi
 # When the command targets kernels/attention, append --ignore flags for the
 # attention kernel tests that are skipped by this script.
 # (A misspelled duplicate entry, "stest_attention_selector.py", was dropped:
 # test_attention_selector.py is already listed below.)
 if [[ $commands == *" kernels/attention"* ]]; then
  commands="${commands} \
  --ignore=kernels/attention/test_blocksparse_attention.py \
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
  --ignore=kernels/attention/test_attention_selector.py \
  --ignore=kernels/attention/test_flash_attn.py \
  --ignore=kernels/attention/test_flashinfer.py \
  --ignore=kernels/attention/test_prefix_prefill.py \
  --ignore=kernels/attention/test_cascade_flash_attn.py \
  --ignore=kernels/attention/test_mha_attn.py \
  --ignore=kernels/attention/test_lightning_attn.py \
  --ignore=kernels/attention/test_attention.py"
 fi
 if [[ $commands == *" kernels/quantization"* ]]; then
  commands="${commands} \
  --ignore=kernels/quantization/test_int8_quant.py \
  --ignore=kernels/quantization/test_aqlm.py \
  --ignore=kernels/quantization/test_machete_mm.py \
  --ignore=kernels/quantization/test_block_fp8.py \
  --ignore=kernels/quantization/test_block_int8.py \
  --ignore=kernels/quantization/test_marlin_gemm.py \
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
  --ignore=kernels/quantization/test_int8_kernel.py"
 fi
 if [[ $commands == *" kernels/mamba"* ]]; then
  commands="${commands} \
  --ignore=kernels/mamba/test_mamba_mixer2.py \
  --ignore=kernels/mamba/test_causal_conv1d.py \
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
 fi
 if [[ $commands == *" kernels/moe"* ]]; then
  commands="${commands} \
  --ignore=kernels/moe/test_moe.py \
  --ignore=kernels/moe/test_cutlass_moe.py \
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 fi
 #ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
  --ignore=entrypoints/openai/test_root_path.py \
  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi
 #ignore certain Entrypoints/llm tests
 if [[ $commands == *" entrypoints/llm "* ]]; then
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
 #Obsolete currently
 ##ignore certain Entrypoints/llm tests
 #if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
 #  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 #fi
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
 # --ignore=entrypoints/openai/test_accuracy.py \
 # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
 # Number of shards (and containers) launched in parallel in sharded mode.
 PARALLEL_JOB_COUNT=8
 # Value passed to the container as PYTHONPATH.
 MYPYTHONPATH=".."
 # If the command contains a shard flag, run all shards in parallel, one
 # container per GPU (the original note states the host has 8 GPUs).
 if [[ $commands == *"--shard-id="* ]]; then
  # Fill in the empty "--num-shards= " placeholder (note the trailing space in
  # the pattern) with the number of parallel jobs.
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # Fill in the empty "--shard-id= " placeholder with this shard's index.
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    # $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES is left unquoted, so it is
    # word-split into separate docker arguments. Each shard sees a single GPU
    # via HIP_VISIBLE_DEVICES; its combined stdout/stderr is prefixed with the
    # shard number, and the whole pipeline runs in the background.
    docker run \
        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
        --network=host \
        --shm-size=16gb \
        --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
        -e HF_TOKEN \
        -e AWS_ACCESS_KEY_ID \
        -e AWS_SECRET_ACCESS_KEY \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
        -e "PYTHONPATH=${MYPYTHONPATH}" \
        --name "${container_name}_${GPU}" \
        "${image_name}" \
        /bin/bash -c "${commands_gpu}" \
        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    # Remember the PID of the background job so it can be waited on below.
    PIDS+=($!)
  done
  # Wait for every background job and record its exit status.
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  # Exit with the first non-zero status that was recorded.
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
 else
  # Non-sharded mode: run the commands once, in the foreground, on GPU 0.
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
          --network=host \
          --shm-size=16gb \
          --rm \
          -e HIP_VISIBLE_DEVICES=0 \
          -e HF_TOKEN \
          -e AWS_ACCESS_KEY_ID \
          -e AWS_SECRET_ACCESS_KEY \
          -v "${HF_CACHE}:${HF_MOUNT}" \
          -e "HF_HOME=${HF_MOUNT}" \
          -e "PYTHONPATH=${MYPYTHONPATH}" \
          --name "${container_name}" \
          "${image_name}" \
          /bin/bash -c "${commands}"
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -0,0 +1,48 @@
 #!/bin/bash
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Setup cleanup
 remove_docker_container() {
   # Force-remove the test container when one was recorded in $container_id
   # (a failed removal is tolerated), then prune unused podman resources.
   [[ -z "$container_id" ]] || podman rm -f "$container_id" || true
   podman system prune -f
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
 podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
 # Run the image
 container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
 # Run the CPU smoke tests inside the already-running container whose id is in
 # $container_id. Each `podman exec` runs a small script with `set -e`, so the
 # first failing command inside that script aborts it.
 function cpu_tests() {
  # offline inference
  podman exec -it "$container_id" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run basic model test: install the test dependencies, then run the selected
  # generation and pooling tests.
  podman exec -it "$container_id" bash -c "
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
 }
 # All of CPU tests are expected to be finished less than 40 mins.
 export container_id
 export -f cpu_tests
 timeout 40m bash -c cpu_tests
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
@@ -0,0 +1,13 @@
 #!/bin/bash
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Setup cleanup
 # Cleanup handler: force-remove the cpu-test container (ignoring a failed
 # removal) and then prune unused Docker resources.
 remove_docker_container() {
   docker rm -f cpu-test || true
   docker system prune -f
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
 docker build -t cpu-test -f docker/Dockerfile.s390x .
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -0,0 +1,94 @@
 #!/bin/bash
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 # Setup cleanup
 remove_docker_container() {
   # Remove both test containers (default and avx2 builds) and their images for
   # this build number; failures of either removal are tolerated.
   set -e
   local image_base="cpu-test-${BUILDKITE_BUILD_NUMBER}"
   docker rm -f "${image_base}-${NUMA_NODE}" "${image_base}-avx2-${NUMA_NODE}" || true
   docker image rm "${image_base}" "${image_base}-avx2" || true
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 # Run the CPU test suite inside the two containers started above.
 # Positional arguments (see the `timeout ... bash -c "cpu_tests ..."` call):
 #   $1  value exported as VLLM_CPU_OMP_THREADS_BIND for the online-serving step
 #   $2  NUMA node, part of the container names
 #   $3  Buildkite build number, part of the container names
 # Every `docker exec` script starts with `set -e`, so the first failing command
 # inside it aborts that script; the function-level `set -e` then stops the run.
 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
  export BUILDKITE_BUILD_NUMBER=$3
  # offline inference (runs in the avx2 container)
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -v -s tests/kernels/test_cache.py -m cpu_model
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
  # Run compressed-tensor test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
  # Run AWQ test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/quantization/test_ipex_quant.py"
  # Run chunked-prefill and prefix-cache test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
    tests/basic_correctness/test_chunked_prefill.py"  
  # online serving: start the OpenAI-compatible server in the background, poll
  # /v1/models for up to 600 seconds, then run the serving benchmark against it.
  # $1 is expanded by this (outer) shell because the script is double-quoted.
  # NOTE(review): the background server is not stopped explicitly afterwards —
  # confirm that it does not interfere with the multi-lora step below.
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    export VLLM_CPU_KVCACHE_SPACE=10 
    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
      --backend vllm \
      --dataset-name random \
      --model facebook/opt-125m \
      --num-prompts 20 \
      --endpoint /v1/completions \
      --tokenizer facebook/opt-125m"
  # Run multi-lora tests
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/lora/test_qwen2vl.py"
 }
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
 timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -0,0 +1,30 @@
 #!/bin/bash
 # This script build the GH200 docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
 python3 use_existing_torch.py
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg vllm_fa_cmake_gpu_arches="90-real"
 # Setup cleanup
 # Cleanup handler: force-remove the gh200-test container; a failed removal
 # (for example when the container does not exist) is ignored.
 remove_docker_container() {
   docker rm -f gh200-test || true
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and test offline inference
 docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -0,0 +1,26 @@
 #!/bin/bash
 # This script builds the HPU docker image and runs offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 # Try building the docker image
 docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
 # override the exit code of the script, so we need to use
 # separate remove_docker_containers and remove_docker_containers_and_exit
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
 # Force-remove both test containers; a failed removal is ignored.
 remove_docker_containers() {
   docker rm -f hpu-test || true
   docker rm -f hpu-test-tp2 || true
 }
 # EXIT handler: clean up, then exit with the status recorded in $EXITCODE so
 # the script's result is the one tracked explicitly by this script.
 remove_docker_containers_and_exit() {
   remove_docker_containers
   exit $EXITCODE
 }
 trap remove_docker_containers_and_exit EXIT
 remove_docker_containers
 # Run the image and launch offline inference
 docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
 EXITCODE=$?
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@@ -0,0 +1,63 @@
 #!/bin/bash
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
 set -v
 image_name="neuron/vllm-ci"
 container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 # Try building the docker image
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 # Prune old images, containers and volumes to save disk space, at most once
 # every 24 hours. The time of the last prune is kept in a file under /tmp.
 timestamp_file=/tmp/neuron-docker-build-timestamp
 if [ ! -f "$timestamp_file" ]; then
     # First run on this host: just record the current time.
     date "+%s" > "$timestamp_file"
 else
     last_build=$(cat "$timestamp_file")
     current_time=$(date +%s)
     # 86400 seconds = one day.
     if [ $((current_time - last_build)) -gt 86400 ]; then
         # Remove dangling images (those that are not tagged and not used by any container)
         docker image prune -f
         # Remove unused volumes / force the system prune for old images as well.
         docker volume prune -f && docker system prune -f
         echo "$current_time" > "$timestamp_file"
     fi
 fi
 docker build -t "${image_name}" -f docker/Dockerfile.neuron .
 # Setup cleanup
 # EXIT handler. Despite its name, this removes the built image
 # (${image_name}), not a container; a failed removal is ignored.
 remove_docker_container() {
    docker image rm -f "${image_name}" || true;
 }
 trap remove_docker_container EXIT
 # Run the image.
 # The inner script is double-quoted, so every `$` meant for the shell inside
 # the container must be escaped (\$f); otherwise this outer shell expands it
 # before the container starts. `set -e` makes the first failing step fail the
 # whole run instead of letting a later success hide it.
 docker run --rm -it --device=/dev/neuron0 --network bridge \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -e "HF_TOKEN=${HF_TOKEN}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       "${image_name}" \
       /bin/bash -c "
            set -e;
            python3 /workspace/vllm/examples/offline_inference/neuron.py;
            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
                echo \"Running test file: \$f\";
                python3 -m pytest \$f -v --capture=tee-sys;
            done
       "
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -0,0 +1,103 @@
 #!/bin/bash
 set -xu
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 # Set up cleanup.
 # Force-remove the tpu-test container; a failed removal is ignored.
 remove_docker_container() {
   docker rm -f tpu-test || true
 }
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
 #
 # Structure of the quoted script:
 #   1. Setup chain (pip installs, exports, tpu-info). It ends with
 #      `|| exit 1;` so that it finishes in the foreground before any test
 #      starts and so that the exports apply to every test. Leaving it joined
 #      to the first test group with `&&` would background the whole chain
 #      together with that group.
 #   2. One background job per test group, each printing its own exit code.
 #   3. `wait` blocks until all background jobs have finished.
 #
 # Do not put `#` comments inside the quoted script: backslash-newlines are
 # removed inside double quotes, so a comment would swallow every continued
 # line after it (including `wait`).
 #
 # Disabled until the TPU LoRA feature is activated; re-add as another
 # `{ ...; } & \` group before `wait`:
 #     echo TEST_13: Running test_moe_pallas.py;
 #     python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/;
 #     echo TEST_13_EXIT_CODE: \$?;
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest pytest-asyncio tpu-info \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && export VLLM_XLA_CACHE_PATH= \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo HARDWARE \
    && tpu-info \
    || exit 1; \
    { \
        echo TEST_0: Running test_perf.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
        echo TEST_0_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_1: Running test_compilation.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
        echo TEST_1_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_2: Running test_basic.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
        echo TEST_2_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
        python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
        echo TEST_3_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_4: Running test_quantization_accuracy.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
        echo TEST_4_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_5: Running examples/offline_inference/tpu.py; \
        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
        echo TEST_5_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_6: Running test_tpu_model_runner.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
        echo TEST_6_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_7: Running test_sampler.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
        echo TEST_7_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_8: Running test_topk_topp_sampler.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
        echo TEST_8_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_9: Running test_multimodal.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
        echo TEST_9_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_10: Running test_pallas.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
        echo TEST_10_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_11: Running test_struct_output_generate.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
        echo TEST_11_EXIT_CODE: \$?; \
    } & \
    { \
        echo TEST_12: Running test_moe_pallas.py; \
        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
        echo TEST_12_EXIT_CODE: \$?; \
    } & \
    wait \
    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
 "
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -0,0 +1,31 @@
 #!/bin/bash
 # This script builds the XPU docker image and runs offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 # Try building the docker image
 docker build -t ${image_name} -f docker/Dockerfile.xpu .
 # Setup cleanup
 remove_docker_container() {
   # Best-effort cleanup: each step tolerates failure so the later ones still run.
   #   1. the test container
   #   2. the image built for this commit
   #   3. any other unused Docker resources
   docker rm -f "${container_name}" || true
   docker image rm -f "${image_name}" || true
   docker system prune -f || true
 }
 trap remove_docker_container EXIT
 # Run the image and test offline inference/tensor parallel.
 # `set -e` inside the container script makes a failure of the first run fail
 # the job; without it only the status of the last command would be reported.
 docker run \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
    sh -c '
    set -e
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -1,18 +1,20 @@
 #!/bin/bash
 # This script is run by buildkite to run the benchmarks and upload the results to buildkite
 set -ex
 set -o pipefail
-# cd into parent directory of this file
+# cd 2 levels into the working directory
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 # run server-based benchmarks and upload the result to buildkite
@@ -50,11 +52,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
 echo '```' >> benchmark_results.md
-tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
+tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
 echo '```' >> benchmark_results.md
 # if the agent binary is not found, skip uploading the results, exit 0
 if [ ! -f /usr/bin/buildkite-agent ]; then
    exit 0
 fi
 # upload the results to buildkite
-/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
 # exit with the exit code of the benchmarks
 if [ $bench_latency_exit_code -ne 0 ]; then
@@ -69,4 +76,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
    exit $bench_serving_exit_code
 fi
-/workspace/buildkite-agent artifact upload openai-*.json
+rm ShareGPT_V3_unfiltered_cleaned_split.json
 buildkite-agent artifact upload "*.json"
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@@ -0,0 +1,108 @@
 #!/bin/bash
 set -euox pipefail
 if [[ $# -lt 4 ]]; then
    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
 fi
 WORKING_DIR=$1
 NUM_NODES=$2
 NUM_GPUS=$3
 DOCKER_IMAGE=$4
 shift 4
 COMMANDS=("$@")
 if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
    echo "The number of commands must be equal to the number of nodes."
    echo "Number of nodes: $NUM_NODES"
    echo "Number of commands: ${#COMMANDS[@]}"
    exit 1
 fi
 echo "List of commands"
 for command in "${COMMANDS[@]}"; do
    echo "$command"
 done
 # Create the user-defined Docker network "docker-net" (subnet 192.168.10.0/24)
 # to which the node containers are attached with fixed IP addresses.
 start_network() {
    docker network create --subnet=192.168.10.0/24 docker-net
 }
 start_nodes() {
    # Start one detached container per node and join them into a Ray cluster.
    # Node N is given GPUs N*NUM_GPUS .. N*NUM_GPUS+NUM_GPUS-1.
    for node in $(seq 0 $((NUM_NODES - 1))); do
        # Build the --gpus value, e.g. "device=0,1" (the double quotes are part
        # of the value).
        first_gpu=$((node * NUM_GPUS))
        gpu_list=$(seq -s, "$first_gpu" $((first_gpu + NUM_GPUS - 1)))
        GPU_DEVICES="\"device=${gpu_list}\""

        # start the container in detached mode
        # things to note:
        # 1. --shm-size=10.24gb is required. don't use --ipc=host
        # 2. pass HF_TOKEN to the container
        # 3. map the huggingface cache directory to the container
        # 4. assign ip addresses to the containers (head node: 192.168.10.10,
        #    worker nodes: starting from 192.168.10.11)
        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
            -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
            --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
            /bin/bash -c "tail -f /dev/null"

        # organize containers into a ray cluster
        if [ "$node" -ne 0 ]; then
            # worker node: connect to the head node
            docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
        else
            # head node: start ray, then give it time to come up
            docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
            sleep 10
        fi
    done

    # give the cluster time to form, then print its status
    sleep 10
    docker exec node0 /bin/bash -c "ray status"
 }
 run_nodes() {
    # Run each node's command inside its container.
    # Nodes are visited from the highest index down to 0: workers are started
    # detached first, and the head node (node 0) runs last in the foreground so
    # that its output shows up in the buildkite logs.
    for node in $(seq $((NUM_NODES - 1)) -1 0); do
        # Same GPU assignment as used when the containers were started; here it
        # is only reported.
        first_gpu=$((node * NUM_GPUS))
        gpu_list=$(seq -s, "$first_gpu" $((first_gpu + NUM_GPUS - 1)))
        GPU_DEVICES="\"device=${gpu_list}\""
        echo "Running node$node with GPU devices: $GPU_DEVICES"

        node_cmd="cd $WORKING_DIR ; ${COMMANDS[$node]}"
        if [ "$node" -eq 0 ]; then
            docker exec "node$node" /bin/bash -c "$node_cmd"
        else
            docker exec -d "node$node" /bin/bash -c "$node_cmd"
        fi
    done
 }
 # EXIT handler: stop every node container and remove the Docker network.
 # The script runs with `set -e`, so each step tolerates failure (`|| true`);
 # otherwise one container that was never started would abort the handler and
 # leave the remaining containers and the network behind.
 cleanup() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
        docker stop "node$node" || true
    done
    docker network rm docker-net || true
 }
 trap cleanup EXIT
 start_network
 start_nodes
 run_nodes
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -0,0 +1,78 @@
 #!/usr/bin/env bash
 # Upload the single wheel found in artifacts/dist/ (plus the generated
 # index.html) to the vllm-wheels S3 bucket under the per-commit, nightly and
 # per-version prefixes.
 set -ex
 # Assume wheels are in artifacts/dist/*.whl
 # Enable nullglob only for the expansion below: without it, an unmatched
 # pattern stays in the array as one literal string, so the "exactly one wheel"
 # check would wrongly pass when no wheel exists.
 shopt -s nullglob
 wheel_files=(artifacts/dist/*.whl)
 shopt -u nullglob
 # Check that exactly one wheel is found
 if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
 fi
 # Get the single wheel file
 wheel="${wheel_files[0]}"
 # Rename 'linux' to 'manylinux1' in the wheel filename
 new_wheel="${wheel/linux/manylinux1}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
 # Extract the version from the wheel's METADATA file
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
 echo "Version: $version"
 normal_wheel="$wheel" # Save the wheel filename that carries the real version
 # If the version contains "dev", additionally publish a copy under the fixed
 # version 1.0.0.dev (keeping a trailing cu* suffix as a local version label)
 # for consistency
 if [[ $version == *dev* ]]; then
    # text after the last '.' of the version string
    suffix="${version##*.}"
    if [[ $suffix == cu* ]]; then
        new_version="1.0.0.dev+${suffix}"
    else
        new_version="1.0.0.dev"
    fi
    new_wheel="${wheel/$version/$new_version}"
    # use cp to keep both files in the artifacts directory
    cp -- "$wheel" "$new_wheel"
    wheel="$new_wheel"
    version="$new_version"
 fi
 # Generate index.html for the wheel
 python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 # Upload the wheels (and the index) for this commit
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
 elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
 else
    # only upload index.html for cu128 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
 # Upload the wheels (and the index) for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
 elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
 else
    # only upload index.html for cu128 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
 # Upload the wheel and the index under the version prefix
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
 aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1,97 +1,763 @@
 # In this file, you can add more tests to run either by adding a new step or
 # adding a new command to an existing step. See different options here for examples.
-# This script will be feed into Jinja template in `test-template.j2` to generate
+
-# the final pipeline yaml file.
+# This script will be fed into the Jinja template in `test-template-aws.j2` at
 # https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
 # to generate the final pipeline yaml file.
 # Documentation
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatible with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
 # gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
 # num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2, 4.
 # num_nodes(int): whether to simulate a multi-node setup by launching multiple containers on one host,
 #     in this case, commands must be specified. the first command runs on first host, the second
 #     command runs on the second host.
 # working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
 # source_file_dependencies(list): the list of path prefixes for which the test is opted in; if empty, the test will always run.
 # When adding a test
 # - If the test belongs to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 #   Note that all steps execute in parallel.
 steps:
- label: Regression Test
+##### fast check tests  #####
-  command: pytest -v -s test_regression.py
+
 - label: Documentation Build # 2min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/test_docs"
  fast_check: true
  no_gpu: True
  commands:
  - pip install -r ../requirements/docs.txt
  # TODO: add `--strict` once warnings in docstrings are fixed
  - mkdocs build
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
  - tests/async_engine
  - tests/test_inputs
  - tests/multimodal
  - tests/test_utils
  - tests/worker
  - tests/standalone_tests/lazy_imports.py
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker
 - label: Python-only Installation Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  commands:
  - bash standalone_tests/python_only_compile.sh
 - label: Basic Correctness Test # 30min
  mirror_hardwares: [amdexperimental, amdproduction]
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_preemption
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 - label: Chunked Prefill Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
  commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 - label: Core Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  fast_check: true
  source_file_dependencies:
  - vllm/core
  - vllm/distributed
  - tests/core
  commands:
  - pytest -v -s core
 - label: Entrypoints Test # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 - label: Distributed Tests (4 GPUs) # 10min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
  commands:
  # test with tp=2 and external_dp=2
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
 - label: Metrics, Tracing Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
  commands:
  - pytest -v -s metrics
  - pytest -v -s tracing
 ##### fast check tests  #####
 #####  1 GPU test  #####
 - label: Regression Test # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
+- label: Engine Test # 10min
-  command: pytest -v -s async_engine
+  mirror_hardwares: [amdexperimental, amdproduction]
-
+  source_file_dependencies:
- label: Basic Correctness Test
+  - vllm/
-  command: pytest -v -s basic_correctness
+  - tests/engine
-
+  - tests/tokenization
- label: Core Test
+  - tests/test_sequence
-  command: pytest -v -s core
+  - tests/test_config
-
+  - tests/test_logger
 - label: Distributed Comm Ops Test
  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.
 - label: Distributed Tests
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.
  commands:
-  - pytest -v -s test_pynccl.py
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
-  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
+  # OOM in the CI unless we run this separately
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
+  - pytest -v -s tokenization
- label: Engine Test
+- label: V1 Test
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    # split the test to avoid interference
    - pytest -v -s v1/core
    - pytest -v -s v1/engine
    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/sample
    - pytest -v -s v1/worker
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/spec_decode
    - pytest -v -s v1/kv_connector/unit
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_metrics_reader.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Entrypoints Test
+- label: Examples Test # 25min
-  command: pytest -v -s entrypoints
+  mirror_hardwares: [amdexperimental]
 - label: Examples Test
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
  commands:
-    # install aws cli for llava_example.py
+    - pip install tensorizer # for tensorizer test
-    - pip install awscli
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
-    - python3 offline_inference.py
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference_with_prefix.py
+    - python3 offline_inference/basic/chat.py
-    - python3 llm_engine_example.py
+    - python3 offline_inference/prefix_caching.py
-    - python3 llava_example.py
+    - python3 offline_inference/llm_engine_example.py
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_embedding.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Kernels Test %N
+- label: Prefix Caching Test # 9min
-  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  mirror_hardwares: [amdexperimental, amdproduction]
-  parallelism: 4
+  source_file_dependencies:
-
+  - vllm/
- label: Models Test
+  - tests/prefix_caching
  commands:
    - bash ../.buildkite/download-images.sh
    - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
 - label: Llava Test
  commands:
    - bash ../.buildkite/download-images.sh
    - pytest -v -s models/test_llava.py
 - label: Prefix Caching Test
  commands:
    - pytest -v -s prefix_caching
- label: Samplers Test
+- label: Samplers Test # 36min
-  command: pytest -v -s samplers
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test
+- label: LogitsProcessor Test # 5min
-  command: pytest -v -s test_logits_processor.py
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/model_executor/guided_decoding
  - tests/test_logits_processor
  - tests/model_executor/test_guided_processors
  commands:
    - pytest -v -s test_logits_processor.py
    - pytest -v -s model_executor/test_guided_processors.py
- label: Worker Test
+- label: Speculative decoding tests # 40min
-  command: pytest -v -s worker
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
  - vllm/model_executor/models/eagle.py
  commands:
    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: Speculative decoding tests
+- label: LoRA Test %N # 15min each
-  command: pytest -v -s spec_decode
+  mirror_hardwares: [amdexperimental]
-
+  source_file_dependencies:
- label: LoRA Test %N
+  - vllm/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4
- label: Metrics Test
+- label: PyTorch Compilation Unit Tests
-  command: pytest -v -s metrics
+  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
    - vllm/
    - tests/compile
  commands:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
    - pytest -v -s compile/test_sequence_parallelism.py
    - pytest -v -s compile/test_async_tp.py
- label: Benchmarks
+- label: PyTorch Fullgraph Smoke Test # 9min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
 - label: PyTorch Fullgraph Test # 18min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
 - label: Kernels Core Operation Test
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  commands:
    - pytest -v -s kernels/core
 - label: Kernels Attention Test %N
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
 - label: Kernels Quantization Test %N
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
    - pytest -v -s kernels/quantization  --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
 - label: Kernels MoE Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  commands:
    - pytest -v -s kernels/moe
 - label: Kernels Mamba Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  commands:
    - pytest -v -s kernels/mamba
 - label: Tensorizer Test # 11min
  mirror_hardwares: [amdexperimental, amdproduction]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
  - tests/tensorizer_loader
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 - label: Benchmarks # 9min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
  - benchmarks/
  commands:
-  - pip install aiohttp
+  - bash scripts/run-benchmarks.sh
  - bash run-benchmarks.sh
- label: Documentation Build
+- label: Benchmarks CLI Test # 10min
-  working_dir: "/vllm-workspace/docs"
+  mirror_hardwares: [amdexperimental, amdproduction]
-  no_gpu: True
+  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
-  - pip install -r requirements-docs.txt
+  - pytest -v -s benchmarks/
-  - SPHINXOPTS=\"-W\" make html
+
 - label: Quantization Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 - label: LM Eval Small Models # 53min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 - label: OpenAI API correctness
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/
 - label: Encoder Decoder tests # 5min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
  commands:
    - pytest -v -s encoder_decoder
 - label: OpenAI-Compatible Tool Use # 20 min
  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
    - vllm/
    - tests/tool_use
    - tests/mistral_tool_use
  commands:
    - pytest -v -s tool_use
    - pytest -v -s mistral_tool_use
 #####  models test  #####
 - label: Basic Models Test # 24min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
    - pytest -v -s models/test_utils.py
    - pytest -v -s models/test_vision.py
    - pytest -v -s models/test_initialization.py
 - label: Language Models Test (Standard)
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/language -m core_model
 - label: Language Models Test (Extended Generation) # 1hr20min
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pytest -v -s models/language/generation -m 'not core_model'
 - label: Language Models Test (Extended Pooling)  # 36min
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'
 - label: Multi-Modal Models Test (Standard)
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/multimodal/processing
    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
 - label: Multi-Modal Models Test (Extended) 2
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
 - label: Multi-Modal Models Test (Extended) 3
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 - label: Quantized Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
    - pytest -v -s models/quantization
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true
  commands:
    - echo 'Testing custom models...'
    # PR authors can temporarily add commands below to test individual models
    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
 #####  1 GPU test  #####
 #####  multi gpus test  #####
 - label: Distributed Comm Ops Test # 7min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
 - label: 2 Node Tests (4 GPUs in total) # 16min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
 - label: Distributed Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - vllm/compilation
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - vllm/v1/engine/
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 - label: Plugin Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
 - label: Multi-step Tests (4 GPUs) # 36min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/model_executor/layers/sampler.py
  - vllm/sequence.py
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/multi_step_worker.py
  - vllm/worker/model_runner_base.py
  - vllm/worker/model_runner.py
  - vllm/worker/multi_step_model_runner.py
  - vllm/engine
  - tests/multi_step
  commands:
  # this test is quite flaky
  # TODO: investigate and fix.
  # - pytest -v -s multi_step/test_correctness_async_llm.py
  - pytest -v -s multi_step/test_correctness_llm.py
 - label: Pipeline Parallelism Test # 45min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
 - label: LoRA TP Test (Distributed)
  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
    # FIXME: find out which code initializes CUDA before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
 - label: Weight Loading Multiple GPU Test  # 33min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 - label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental] 
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 ##### multi gpus test #####
 ##### A100 test #####
 - label: Distributed Tests (A100) # optional
  gpu: a100
  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py
 - label: LM Eval Large Models # optional
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -1,66 +0,0 @@
 {% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
 {% set default_num_gpu = 1 %}
 {% set default_working_dir = "/vllm-workspace/tests" %}
 steps:
  - label: "AMD Test"
    agents:
      queue: amd
    command: bash .buildkite/run-amd-test.sh
  - label: ":docker: build image"
    commands:
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
  - wait
  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
    plugins:
      - kubernetes:
          podSpec:
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
            containers:
              - image: "{{ docker_image }}"
                command: ["bash"]
                args:
                - '-c'
                - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
                {% if not step.no_gpu %}
                resources:
                  requests:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                  limits:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                {% endif %}
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
  {% endfor %}
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,26 @@
 BasedOnStyle: Google
 UseTab: Never
 IndentWidth: 2
 ColumnLimit: 80
 # Force pointers to the type for C++.
 DerivePointerAlignment: false
 PointerAlignment: Left
 # Reordering #include statements can (and currently will) introduce errors
 SortIncludes: false
 # Style choices
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
 IndentPPDirectives: BeforeHash
 IncludeCategories:
  - Regex:           '^<'
    Priority:        4
  - Regex:           '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
    Priority:        3
  - Regex:           '^"(qoda|\.\.)/'
    Priority:        2
  - Regex:           '.*'
    Priority:        1
--- a/.dockerignore
+++ b/.dockerignore
@@ -1 +1,33 @@
 /.venv
 /build
 dist
 vllm/*.so
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 .mypy_cache
 # Distribution / packaging
 .Python
 /build/
 cmake-build-*/
 CMakeUserPresets.json
 develop-eggs/
 /dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -0,0 +1,48 @@
 # See https://help.github.com/articles/about-codeowners/
 # for more info about CODEOWNERS file
 # This list covers the "core" components of vLLM that require careful review
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
 /vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 CMakeLists.txt @tlrmchlsmth
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
 /tests/multi_step @alexm-redhat @comaniac
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
 /tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee
 # Docs
 /docs @hmellor
 mkdocs.yaml @hmellor
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -0,0 +1,2 @@
 github: [vllm-project]
 open_collective: vllm
--- a/.github/ISSUE_TEMPLATE/100-documentation.yml
+++ b/.github/ISSUE_TEMPLATE/100-documentation.yml
@@ -20,3 +20,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -14,10 +14,11 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
@@ -37,3 +38,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -14,10 +14,11 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
@@ -35,3 +36,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -14,14 +14,20 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      <details>
      <summary>The output of <code>python collect_env.py</code></summary>
      ```text
-      The output of `python collect_env.py`
+      Your output of `python collect_env.py` here
      ```
      </details>
  validations:
    required: true
 - type: textarea
@@ -57,6 +63,10 @@ body:
      If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
      Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
      Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
      If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
    placeholder: |
      A clear and concise description of what the bug is.
@@ -65,17 +75,24 @@ body:
      ```
      ```
-      The error message you got, with the full traceback.
+      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
  validations:
    required: true
 - type: markdown
  attributes:
-    value: >
+    value: |
-      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
+      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
-      Thanks for contributing 🎉!
+      Thanks for reporting 🙏!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/450-ci-failure.yml
+++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml
@@ -0,0 +1,69 @@
 name: 🧪 CI failure report
 description: Report a failing test.
 title: "[CI Failure]: "
 labels: ["ci-failure"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Include the name of the failing Buildkite step and test file in the title.
 - type: input
  attributes:
    label: Name of failing test
    description: |
      Paste in the fully-qualified name of the failing test from the logs.
    placeholder: |
      `path/to/test_file.py::test_name[params]`
  validations:
    required: true
 - type: checkboxes
  attributes:
    label: Basic information
    description: Select all items that apply to the failing test.
    options:
      - label: Flaky test
      - label: Can reproduce locally
      - label: Caused by external libraries (e.g. bug in `transformers`)
 - type: textarea
  attributes:
    label: 🧪 Describe the failing test
    description: |
      Please provide a clear and concise description of the failing test.
    placeholder: |
      A clear and concise description of the failing test.
      ```
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
  validations:
    required: true
 - type: textarea
  attributes:
    label: 📝 History of failing test
    description: |
      Since when did the test start to fail?
      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
    placeholder: |
      Approximate timeline and/or problematic PRs
      A link to the Buildkite analytics of the failing test (if available)
  validations:
    required: true
 - type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
 - type: markdown
  attributes:
    value: >
      Thanks for reporting 🙏!
--- a/.github/ISSUE_TEMPLATE/500-feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/500-feature-request.yml
@@ -29,3 +29,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github/ISSUE_TEMPLATE/600-new-model.yml
@@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
 - type: textarea
  attributes:
    label: The model to consider.
@@ -31,3 +31,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
@@ -35,10 +35,11 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
@@ -49,3 +50,10 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -0,0 +1,56 @@
 name: 💬 Request for comments (RFC).
 description: Ask for feedback on major architectural changes or design choices.
 title: "[RFC]: "
 labels: ["RFC"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
 - type: textarea
  attributes:
    label: Motivation.
    description: >
      The motivation of the RFC.
  validations:
    required: true
 - type: textarea
  attributes:
    label: Proposed Change.
    description: >
      The proposed change of the RFC.
  validations:
    required: true
 - type: textarea
  attributes:
    label: Feedback Period.
    description: >
      The feedback period of the RFC. Usually at least one week.
  validations:
    required: false
 - type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC.
  validations:
    required: false
 - type: textarea
  attributes:
    label: Any Other Things.
    description: >
      Any other things you would like to mention.
  validations:
    required: false
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
 - type: checkboxes
  id: askllm
  attributes:
    label: Before submitting a new issue...
    options:
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
        required: true
--- a/.github/ISSUE_TEMPLATE/800-misc
+++ b/.github/ISSUE_TEMPLATE/800-misc
@@ -1,21 +0,0 @@
 name: 🎲 Misc/random discussions that do not fit into the above categories.
 description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
 title: "[Misc]: "
 labels: ["misc"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: Anything you want to discuss about vllm.
    description: >
      Anything you want to discuss about vllm.
  validations:
    required: true
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1 +1,5 @@
 blank_issues_enabled: false
 contact_links:
  - name: Questions
    url: https://discuss.vllm.ai
    about: Ask questions and discuss with other vLLM community members
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -2,63 +2,5 @@ FILL IN THE PR DESCRIPTION HERE
 FIX #xxxx (*link existing issues this PR will resolve*)
-**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
+<!--- pyml disable-next-line no-emphasis-as-heading -->
-
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
 ---
 <details>
 <!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
 <summary><b> PR Checklist (Click to Expand) </b></summary>
 <p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
 <h3>PR Title and Classification</h3>
 <p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
 <ul>
    <li><code>[Bugfix]</code> for bug fixes.</li>
    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
    <li><code>[Frontend]</code> For changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.) </li>
    <li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
    <li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
    <li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
 </ul>
 <p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
 <h3>Code Quality</h3>
 <p>The PR need to meet the following code quality standards:</p>
 <ul>
    <li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
    <li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
    <li>The code need to be well-documented to ensure future contributors can easily understand the code.</li>
    <li>Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.</li>
    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
 </ul>
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
 <h3>What to Expect for the Reviews</h3>
 <p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
 <ul>
    <li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
    <li> After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
    <li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
    <li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
 </li>
 </ul>
 <h3>Thank You</h3>
 <p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
 </details>
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,31 @@
 version: 2
 updates:
  # Maintain dependencies for GitHub Actions
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"
    labels: ["dependencies"]
    open-pull-requests-limit: 5
    reviewers: ["khluu", "simon-mo"]
    allow:
      - dependency-type: "all"
    ignore:
      - dependency-name: "*"
        update-types: ["version-update:semver-patch"]
      - dependency-name: "torch"
      - dependency-name: "torchvision"
      - dependency-name: "xformers"
      - dependency-name: "lm-format-enforcer"
      - dependency-name: "gguf"
      - dependency-name: "compressed-tensors"
      - dependency-name: "ray[cgraph]" # Ray Compiled Graph
      - dependency-name: "lm-eval"
    groups:
      minor-update:
        applies-to: version-updates
        update-types: ["minor"]
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -0,0 +1,182 @@
 pull_request_rules:
 - name: label-documentation
  description: Automatically apply documentation label
  conditions:
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
      - files~=^examples/
  actions:
    label:
      add:
        - documentation
 - name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
    - or:
      - files~=^\.github/
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
      - files~=^docker/Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
    label:
      add:
        - ci/build
 - name: label-frontend
  description: Automatically apply frontend label
  conditions:
    - files~=^vllm/entrypoints/
  actions:
    label:
      add:
        - frontend
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      # `~=` is a regular-expression match, not a glob: use `.*` to span the
      # intermediate directory (a bare `*` would only repeat the preceding `/`).
      - files~=^tests/models/.*/audio_language/
      - files~=^tests/models/.*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^vllm/model_executor/guided_decoding/
      - files=tests/model_executor/test_guided_processors.py
      - files=tests/entrypoints/llm/test_guided_generate.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      # CODEOWNERS lists the V1 entrypoint test under this name; match it too
      # so changes to that file receive the label.
      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
      add:
        - structured-output
 - name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
    - or:
      - files~=^vllm/spec_decode/
      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
      - files~=^tests/spec_decode/
  actions:
    label:
      add:
        - speculative-decoding
 - name: label-v1
  description: Automatically apply v1 label
  conditions:
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
  actions:
    label:
      add:
        - v1
 - name: label-tpu
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
    - or:
      - files~=tpu.py
      - files~=_tpu
      - files~=tpu_
      - files~=/tpu/
      - files~=pallas
  actions:
    label:
      add:
        - tpu
 - name: label-tpu-remove
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
    - and:
      - -files~=tpu.py
      - -files~=_tpu
      - -files~=tpu_
      - -files~=/tpu/
      - -files~=pallas
  actions:
    label:
      remove:
        - tpu
 - name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
  actions:
    label:
      add:
        - tool-calling
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
      - conflict
      - -closed
  actions:
    label:
      add:
        - needs-rebase
    comment:
      message: |
       This pull request has merge conflicts that must be resolved before it can be
       merged. Please rebase the PR, @{{author}}.
       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
 - name: assign reviewer for tensorizer changes
  conditions:
      # Top-level conditions are ANDed by Mergify. Group the paths under `or`
      # so that a change to any one of them assigns the reviewer, instead of
      # requiring a single PR to touch all four.
      - or:
        - files~=^vllm/model_executor/model_loader/tensorizer.py
        - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
        - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
        - files~=^tests/tensorizer_loader/
  actions:
    assign:
      users:
        - "sangstar"
 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
      - -conflict
      - -closed
  actions:
    label:
      remove:
        - needs-rebase
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -0,0 +1,50 @@
 #!/bin/bash
 # Strip PR-template boilerplate from a pull request description.
 #
 # Usage: cleanup_pr_body.sh <pr_number>
 #
 # Uses the `gh` CLI to read and update the PR body, and `python3` with the
 # third-party `regex` package for the multi-line <details> removal.

 # Abort on the first failing command and on any use of an unset variable.
 set -eu
 # Ensure exactly one argument (the PR number) is passed.
 if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <pr_number>"
    exit 1
 fi
 PR_NUMBER=$1
 # OLD keeps the body as fetched; NEW is the working copy that gets edited.
 # Comparing the two at the end decides whether the PR needs updating.
 OLD=/tmp/orig_pr_body.txt
 NEW=/tmp/new_pr_body.txt
 gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
 cp "${OLD}" "${NEW}"
 # Remove any line containing "FIX #xxxx (*link existing issues this PR will resolve*)"
 sed -i '/FIX #xxxx.*$/d' "${NEW}"
 # Remove any line containing "FILL IN THE PR DESCRIPTION HERE"
 sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
 # Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
 sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
 # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)",
 # together with an optional "---" separator directly before it.
 # The heredoc delimiter is unquoted, so the shell substitutes ${NEW} into the
 # Python source before python3 reads it.
 python3 - <<EOF
 import regex as re
 with open("${NEW}", "r") as file:
    content = file.read()
 pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
 content = re.sub(pattern, '', content)
 with open("${NEW}", "w") as file:
    file.write(content)
 EOF
 # Only push an update (and print the new body) if ${NEW} differs from ${OLD}.
 if ! cmp -s "${OLD}" "${NEW}"; then
    gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
    echo
    echo "Updated PR body:"
    echo
    cat "${NEW}"
 else
    echo "No changes needed"
 fi
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -0,0 +1,23 @@
 # Adds the `ready` label to a pull request as soon as auto-merge is enabled
 # on it.
 name: Add label on auto-merge enabled
 # The job writes labels on pull requests, so it needs write access to them.
 permissions:
    pull-requests: write
 on:
    pull_request_target:
        types:
            # Fire only when auto-merge gets switched on for the PR.
            - auto_merge_enabled
 jobs:
    add-label-on-auto-merge:
        runs-on: ubuntu-latest
        steps:
            # The action is pinned to a full commit SHA; the trailing comment
            # records the release tag that SHA corresponds to.
            -   name: Add label
                uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
                with:
                    script: |
                        github.rest.issues.addLabels({
                            owner: context.repo.owner,
                            repo: context.repo.repo,
                            issue_number: context.issue.number,
                            labels: ['ready']
                        })
                env:
                    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -0,0 +1,31 @@
 # Runs .github/scripts/cleanup_pr_body.sh against a pull request whenever it
 # is opened, reopened, or edited.
 name: Cleanup PR Body
 on:
  pull_request_target:
    types: [opened, reopened, edited]
 # The script edits the PR description, which requires write access to PRs.
 permissions:
  pull-requests: write
 jobs:
  update-description:
    runs-on: ubuntu-latest
    steps:
      # The checkout provides the cleanup script that the last step runs.
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python
        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.12'
      # `regex` is installed because this workflow's last step runs the
      # cleanup script with bash.
      # NOTE(review): presumably that script imports `regex` - confirm.
      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install regex
      # The PR number is passed as the script's single argument; the token is
      # exposed to the script through the environment.
      - name: Update PR description
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -0,0 +1,85 @@
 # Lints the Helm chart under examples/online_serving/chart-helm, then deploys
 # it into a throwaway kind cluster (CPU image, model served from a local MinIO
 # bucket) and sends one completion request as a smoke test.
 name: Lint and Deploy Charts
 on: pull_request
 permissions:
  contents: read
 jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
    steps:
      # fetch-depth 0 fetches the complete history rather than a shallow clone.
      # NOTE(review): presumably needed so `ct lint --target-branch` can diff
      # against the default branch - confirm.
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
        with:
          version: v3.14.4
       # Python is required because `ct lint` runs Yamale and yamllint, which both need Python.
      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.13'
      - name: Set up chart-testing
        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
        with:
          version: v3.10.1
      # Lint only the vLLM example chart, comparing against the default branch.
      - name: Run chart-testing (lint)
        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
      # Start a MinIO container on its own docker network (`vllm-net`),
      # download the facebook/opt-125m files, and upload them to the
      # `testbucket` bucket so the chart can pull the model from S3-style
      # storage.
      - name: Setup minio
        run: |
          docker network create vllm-net
          docker run -d -p 9000:9000 --name minio --net vllm-net \
                     -e "MINIO_ACCESS_KEY=minioadmin" \
                     -e "MINIO_SECRET_KEY=minioadmin" \
                     -v /tmp/data:/data \
                     -v /tmp/config:/root/.minio \
                     minio/minio server /data
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          export AWS_EC2_METADATA_DISABLED=true
          mkdir opt-125m
          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
      - name: Create kind cluster
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
      # Build the CPU image locally; it is loaded into the kind cluster below.
      - name: Build the Docker image vllm cpu
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
      # Load the required images into the `chart-testing` kind cluster, attach
      # its control-plane container to `vllm-net` so it can reach MinIO, and
      # create the target namespace.
      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
          docker pull amazon/aws-cli:2.6.4
          kind load docker-image  amazon/aws-cli:2.6.4 --name chart-testing
          kind load docker-image vllm-cpu-env:latest --name chart-testing
          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
          kubectl create ns ns-vllm
      # Install the chart with CPU-only overrides (GPU requests/limits set to
      # 0, local image, MinIO endpoint). Pod logs are streamed in the
      # background after a 30 s delay while `helm install --wait` blocks.
      - name: Run chart-testing (install)
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
      # Smoke test: port-forward the service and request one completion.
      # `curl -f` turns an HTTP error status into a non-zero exit code.
      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
          sleep 10
          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
                  --header "Content-Type: application/json" \
                  --data '{
                          "model": "opt-125m",
                          "prompt": "San Francisco is a",
                          "max_tokens": 7,
                          "temperature": 0
                  }'):$CODE"
          echo "$CODE"
--- a/.github/workflows/matchers/actionlint.json
+++ b/.github/workflows/matchers/actionlint.json
@@ -0,0 +1,17 @@
 {
  "problemMatcher": [
    {
      "owner": "actionlint",
      "pattern": [
        {
          "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
          "file": 1,
          "line": 2,
          "column": 3,
          "message": 4,
          "code": 5
        }
      ]
    }
  ]
 }
--- a/.github/workflows/matchers/mypy.json
+++ b/.github/workflows/matchers/mypy.json
@@ -0,0 +1,16 @@
 {
  "problemMatcher": [
    {
      "owner": "mypy",
      "pattern": [
        {
          "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
          "file": 1,
          "line": 2,
          "severity": 3,
          "message": 4
        }
      ]
    }
  ]
 }
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,23 @@
 # Runs the repository's pre-commit hooks on every pull request and on pushes
 # to main.
 name: pre-commit
 on:
  pull_request:
  push:
    branches: [main]
 permissions:
  contents: read
 jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
      with:
        python-version: "3.12"
    # Register the problem matchers for actionlint and mypy before the hooks
    # run (presumably so their output shows up as annotations - confirm).
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
    # Check every file, not just the changed ones, using the hooks assigned
    # to the `manual` stage.
    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
      with:
        extra_args: --all-files --hook-stage manual
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,16 +21,16 @@ jobs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Extract branch info
        shell: bash
        run: |
-          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
      - name: Create Release
        id: create_release
-        uses: "actions/github-script@v6"
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
@@ -39,59 +39,68 @@ jobs:
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)
-  wheel:
+  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. 
-    name: Build Wheel
+  # wheel:
-    runs-on: ${{ matrix.os }}
+  #   name: Build Wheel
-    needs: release
+  #   runs-on: ${{ matrix.os }}
  #   needs: release
-    strategy:
+  #   strategy:
-      fail-fast: false
+  #     fail-fast: false
-      matrix:
+  #     matrix:
-          os: ['ubuntu-20.04']
+  #         os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11']
+  #         python-version: ['3.9', '3.10', '3.11', '3.12']
-          pytorch-version: ['2.1.2']  # Must be the most recent version that meets requirements.txt.
+  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
-          cuda-version: ['11.8', '12.1']
+  #         cuda-version: ['11.8', '12.1']
-    steps:
+  #   steps:
-      - name: Checkout
+  #     - name: Checkout
-        uses: actions/checkout@v3
+  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Set up Linux Env
+  #     - name: Setup ccache
-        if: ${{ runner.os == 'Linux' }}
+  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
-        run: |
+  #       with:
-          bash -x .github/workflows/scripts/env.sh
+  #         create-symlink: true
  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-      - name: Set up Python
+  #     - name: Set up Linux Env
-        uses: actions/setup-python@v4
+  #       if: ${{ runner.os == 'Linux' }}
-        with:
+  #       run: |
-            python-version: ${{ matrix.python-version }}
+  #         bash -x .github/workflows/scripts/env.sh
-      - name: Install CUDA ${{ matrix.cuda-version }}
+  #     - name: Set up Python
-        run: |
+  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+  #       with:
  #           python-version: ${{ matrix.python-version }}
-      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+  #     - name: Install CUDA ${{ matrix.cuda-version }}
-        run: |
+  #       run: |
-          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-      - name: Build wheel
+  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
-        shell: bash
+  #       run: |
-        run: |
+  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
          asset_name=${wheel_name//"linux"/"manylinux1"}
          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
          echo "asset_name=${asset_name}" >> $GITHUB_ENV
-      - name: Upload Release Asset
+  #     - name: Build wheel
-        uses: actions/upload-release-asset@v1
+  #       shell: bash
-        env:
+  #       env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
-        with:
+  #       run: |
-          upload_url: ${{ needs.release.outputs.upload_url }}
+  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
-          asset_path: ./dist/${{ env.wheel_name }}
+  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
-          asset_name: ${{ env.asset_name }}
+  #         asset_name=${wheel_name//"linux"/"manylinux1"}
-          asset_content_type: application/*
+  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
  #     - name: Upload Release Asset
  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
  #       env:
  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  #       with:
  #         upload_url: ${{ needs.release.outputs.upload_url }}
  #         asset_path: ./dist/${{ env.wheel_name }}
  #         asset_name: ${{ env.asset_name }}
  #         asset_content_type: application/*
      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
      # - name: Publish package
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -0,0 +1,27 @@
 # Posts a one-time welcome comment on every newly opened pull request that
 # explains how CI is triggered for the project.
 name: PR Reminder Comment Bot
 # Creating the comment requires write access to pull requests.
 permissions:
  pull-requests: write
 on:
  pull_request_target:
    # Only `opened`, so reopening or editing a PR does not post the comment again.
    types: [opened]
 jobs:
  pr_reminder:
    runs-on: ubuntu-latest
    steps:
      # The comment body is assembled from string literals joined with `+`;
      # `\n\n` separates the paragraphs.
      - name: Remind to run full CI on PR
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
                '🚀'
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,2 @@`
							`github: [vllm-project]`
							`open_collective: vllm`