[CI] Fix Plugin Tests Tests (#28413 )

Signed-off-by: Robert Shaw <robertgshaw2@gmail.com>
[Frontend][2/n] remove empty content from _parse_tool_calls_from_content (#28331 )
2025-11-10 22:36:11 +00:00 · 2025-11-10 14:07:49 -08:00 · 2025-11-10 17:01:17 -05:00 · 2025-11-10 21:03:21 +00:00 · 2025-11-10 20:39:19 +00:00 · 2025-11-10 12:06:04 -08:00
878 changed files with 33469 additions and 13439 deletions
--- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
-model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.595
-  - name: "exact_match,flexible-extract"
-    value: 0.582
-limit: 1000
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -0,0 +1,14 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+  - name: "mmlu_pro"
+    metrics:
+      - name: "exact_match,custom-extract"
+        value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false # we use false to speed up the eval process
+kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
+max_model_len: 40960
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
--- a/.buildkite/lm-eval-harness/configs/models-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size):
    max_model_len = eval_config.get("max_model_len", 4096)
    batch_size = eval_config.get("batch_size", "auto")
    backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
+    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
+        f"kv_cache_dtype={kv_cache_dtype},"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len},"
@@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, tp_size):
        limit=eval_config["limit"],
        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
+        # existing text models in CI, so only apply it for mm, or explicitly set
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
        batch_size=batch_size,
    )
    return results
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -1,184 +0,0 @@
-steps:
-  - label: "Wait for container to be ready"
-    key: wait-for-container-image
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: badouralix/curl-jq
-            command:
-            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - label: "Cleanup H100"
-    agents:
-      queue: H100
-    depends_on: ~
-    command: docker system prune -a --volumes --force
-  
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  # Premerge benchmark
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-    #key: block-h100
-    #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-          - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@@ -1,28 +0,0 @@
-# Nightly benchmark annotation
-
-## Description
-
-This file contains the downloading link for benchmarking results.
-
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
-
-Please download the visualization scripts in the post
-
-## Results reproduction
-
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
-    - Download `nightly-benchmarks.zip`.
-    - In the same folder, run the following code:
-
-    ```bash
-    export HF_TOKEN=<your HF token>
-    apt update
-    apt install -y git
-    unzip nightly-benchmarks.zip
-    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-    ```
-
-And the results will be inside `./benchmarks/results`.
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -1,39 +0,0 @@
-
-# Nightly benchmark
-
-This benchmark aims to:
-
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
-
-Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-
-Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
-
-## Setup
-
- Docker images:
-    - vLLM: `vllm/vllm-openai:v0.6.2`
-    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
-    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
-    - 8x Nvidia A100 GPUs
- Workload:
-    - Dataset
-        - ShareGPT dataset
-        - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
-        - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
-        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
-    - Models: llama-3 8B, llama-3 70B.
-        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
-    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-        - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-    - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-## Known issues
-
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,196 +0,0 @@
-common_pod_spec: &common_pod_spec
-  priorityClassName: perf-benchmark
-  nodeSelector:
-    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  volumes:
-    - name: devshm
-      emptyDir:
-        medium: Memory
-    - name: hf-cache
-      hostPath:
-        path: /root/.cache/huggingface
-        type: Directory
-
-common_container_settings: &common_container_settings
-  command:
-    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-  resources:
-    limits:
-      nvidia.com/gpu: 8
-  volumeMounts:
-    - name: devshm
-      mountPath: /dev/shm
-    - name: hf-cache
-      mountPath: /root/.cache/huggingface
-  env:
-    - name: VLLM_USAGE_SOURCE
-      value: ci-test
-    - name: HF_HOME
-      value: /root/.cache/huggingface
-    - name: VLLM_SOURCE_CODE_LOC
-      value: /workspace/build/buildkite/vllm/performance-benchmark
-    - name: HF_TOKEN
-      valueFrom:
-        secretKeyRef:
-          name: hf-token-secret
-          key: token
-
-steps:
-  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-
-
-
-  - label: "A100 vllm step 10"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: vllm/vllm-openai:v0.6.2
-                <<: *common_container_settings
-
-
-
-  - label: "A100 sglang benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: lmsysorg/sglang:v0.3.2-cu121
-                <<: *common_container_settings
-
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: openmmlab/lmdeploy:v0.6.1-cu12
-                <<: *common_container_settings
-
-
-
-
-  - label: "A100 trt llama-8B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama8B"
-
-
-  - label: "A100 trt llama-70B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama70B"
-
-
-  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image 
-  # - label: "A100 trt benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-  #               <<: *common_container_settings
-
-
-  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
-  #               <<: *common_container_settings
-        
-  - wait
-
-  - label: "Collect the results"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-            - image: vllm/vllm-openai:v0.5.0.post1
-              command:
-              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
-              resources:
-                limits:
-                  nvidia.com/gpu: 8
-              volumeMounts:
-              - name: devshm
-                mountPath: /dev/shm
-              env:
-              - name: VLLM_USAGE_SOURCE
-                value: ci-test
-              - name: VLLM_SOURCE_CODE_LOC
-                value: /workspace/build/buildkite/vllm/performance-benchmark
-              - name: HF_TOKEN
-                valueFrom:
-                  secretKeyRef:
-                    name: hf-token-secret
-                    key: token
-
-  - block: ":rocket: check the results!"
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,26 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-
-from transformers import AutoTokenizer
-
-
-def main(model, cachedir):
-    # Load the tokenizer and save it to the specified directory
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    tokenizer.save_pretrained(cachedir)
-    print(f"Tokenizer saved to {cachedir}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer"
-    )
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
-    parser.add_argument(
-        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
-    )
-
-    args = parser.parse_args()
-    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -1,97 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from tabulate import tabulate
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description="Parse command line arguments for summary-nightly-results script."
-    )
-    parser.add_argument(
-        "--results-folder",
-        type=str,
-        required=True,
-        help="The folder where the results are stored.",
-    )
-    parser.add_argument(
-        "--description", type=str, required=True, help="Description of the results."
-    )
-
-    args = parser.parse_args()
-    return args
-
-
-def get_perf(df, method, model, metric):
-    means = []
-
-    for qps in [2, 4, 8, 16, "inf"]:
-        target = df["Test name"].str.contains(model)
-        target = target & df["Engine"].str.contains(method)
-        target = target & df["Test name"].str.contains("qps_" + str(qps))
-        filtered_df = df[target]
-
-        if filtered_df.empty:
-            means.append(0.0)
-        else:
-            means.append(filtered_df[metric].values[0])
-
-    return np.array(means)
-
-
-def get_perf_w_std(df, method, model, metric):
-    if metric in ["TTFT", "ITL"]:
-        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
-        mean = mean.tolist()
-        std = get_perf(df, method, model, "Std " + metric + " (ms)")
-        if std.mean() == 0:
-            std = None
-        success = get_perf(df, method, model, "Successful req.")
-        if std is not None:
-            std = std / np.sqrt(success)
-            std = std.tolist()
-
-    else:
-        assert metric == "Tput"
-        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)"
-        )
-        mean = mean.tolist()
-        std = None
-
-    return mean, std
-
-
-def main(args):
-    results_folder = Path(args.results_folder)
-
-    results = []
-
-    # collect results
-    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file) as f:
-            results = results + json.loads(f.read())
-
-    # generate markdown table
-    df = pd.DataFrame.from_dict(results)
-
-    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
-
-    with open(args.description) as f:
-        description = f.read()
-
-    description = description.format(nightly_results_benchmarking_table=md_table)
-
-    with open("nightly_results.md", "w") as f:
-        f.write(description)
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from lmdeploy.serve.openai.api_client import APIClient
-
-api_client = APIClient("http://localhost:8000")
-model_name = api_client.available_models[0]
-
-print(model_name)
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -1,78 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-
-main() {
-
-    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-    (which jq) || (apt-get update && apt-get -y install jq)
-    (which zip) || (apt-get install -y zip)
-
-    if [ ! -f /workspace/buildkite-agent ]; then
-        echo "buildkite-agent binary not found. Skip plotting the results."
-        exit 0
-    fi
-
-    # initial annotation
-    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
-
-    # download results
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    mkdir -p results/
-    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
-    ls
-    ls results/
-
-    # upload benchmark results
-    zip -r results.zip results/
-    /workspace/buildkite-agent artifact upload "results.zip"
-
-    # upload benchmarking scripts
-    cd "$VLLM_SOURCE_CODE_LOC/"
-    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
-    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    # upload benchmarking pipeline
-    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
-
-    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
-    
-
-
-    # The figures should be generated by a separate process outside the CI/CD pipeline
-
-    # # generate figures
-    # python3 -m pip install tabulate pandas matplotlib
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
-    #     --description $description \
-    #     --results-folder results/ 
-
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sharegpt
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_2048_128
-
-    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-    #     --description $description \
-    #     --results-folder results/ \
-    #     --dataset sonnet_128_2048
-    
-    # # upload results and figures
-    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -1,464 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-set -x
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
-  echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-  # check if HF_TOKEN is available and valid
-  if [[ -z "$HF_TOKEN" ]]; then
-    echo "Error: HF_TOKEN is not set."
-    exit 1
-  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-    echo "Error: HF_TOKEN does not start with 'hf_'."
-    exit 1
-  else
-    echo "HF_TOKEN is set and valid."
-  fi
-}
-
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-
-get_current_llm_serving_engine() {
-
-  if which lmdeploy >/dev/null; then
-    echo "Container: lmdeploy"
-    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
-    return
-  fi
-
-  if [ -e /tgi-entrypoint.sh ]; then
-    echo "Container: tgi"
-    export CURRENT_LLM_SERVING_ENGINE=tgi
-    return
-  fi
-
-  if which trtllm-build >/dev/null; then
-    echo "Container: tensorrt-llm"
-    export CURRENT_LLM_SERVING_ENGINE=trt
-    return
-  fi
-
-  if [ -e /sgl-workspace ]; then
-    echo "Container: sglang"
-    export CURRENT_LLM_SERVING_ENGINE=sglang
-    return
-  fi
-
-  if [ -e /vllm-workspace ]; then
-    echo "Container: vllm"
-    # move to a completely irrelevant directory, to avoid import vllm from current folder
-    export CURRENT_LLM_SERVING_ENGINE=vllm
-
-    return
-  fi
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-kill_gpu_processes() {
-  pkill -f '[p]ython'
-  pkill -f '[p]ython3'
-  pkill -f '[t]ritonserver'
-  pkill -f '[p]t_main_thread'
-  pkill -f '[t]ext-generation'
-  pkill -f '[l]mdeploy'
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pkill -f '[V]LLM'
-
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-    sleep 1
-  done
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
-    until curl -s localhost:8000/v1/completions > /dev/null; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-ensure_installed() {
-  # Ensure that the given command is installed by apt-get
-  local cmd=$1
-  if ! which "$cmd" >/dev/null; then
-    apt-get update && apt-get install -y "$cmd"
-  fi
-}
-
-run_serving_tests() {
-  # run serving tests using `vllm bench serve` command
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # prepare tokenizer
-    # this is required for lmdeploy.
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-    rm -rf /tokenizer_cache
-    mkdir /tokenizer_cache
-    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$model" \
-      --cachedir /tokenizer_cache
-    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-
-
-    # change model name for lmdeploy (it will not follow standard hf name)
-    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
-      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ $backend = "trt" ]]; then
-        backend="tensorrt-llm"
-      fi
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-
-      if [[ "$dataset_name" = "sharegpt" ]]; then
-
-        client_command="vllm bench serve \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      elif [[ "$dataset_name" = "sonnet" ]]; then
-
-        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
-        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
-        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
-
-        client_command="vllm bench serve \
-          --backend $backend \
-          --tokenizer /tokenizer_cache \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --num-prompts $num_prompts \
-          --sonnet-input-len $sonnet_input_len \
-          --sonnet-output-len $sonnet_output_len \
-          --sonnet-prefix-len $sonnet_prefix_len \
-          --port $port \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --ignore-eos \
-          $client_args"
-
-      else
-
-        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
-        exit 1
-
-      fi
-
-
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      server_command="None"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu,
-          engine: $engine
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-  done
-
-  kill_gpu_processes
-}
-
-run_genai_perf_tests() {
-  # run genai-perf tests
-
-  # $1: a json file specifying genai-perf test cases
-  local genai_perf_test_file
-  genai_perf_test_file=$1
-
-  # Iterate over genai-perf tests
-  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # prepend the current serving engine to the test name
-    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
-
-    # get common parameters
-    common_params=$(echo "$params" | jq -r '.common_parameters')
-    model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tp')
-    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-    port=$(echo "$common_params" | jq -r '.port')
-    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
-      continue
-    fi
-
-    if [[ $reuse_server == "true" ]]; then
-      echo "Reuse previous server for test case $test_name"
-    else
-      kill_gpu_processes
-      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
-        "$server_params" "$common_params"
-    fi
-
-    if wait_for_server; then
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
-    else
-      echo ""
-      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
-      break
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps=$num_prompts
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-      backend=$CURRENT_LLM_SERVING_ENGINE
-
-      if [[ "$backend" == *"vllm"* ]]; then
-        backend="vllm"
-      fi
-      #TODO: add output dir.
-      client_command="genai-perf profile \
-        -m $model \
-        --service-kind openai \
-        --backend "$backend" \
-        --endpoint-type chat \
-        --streaming \
-        --url localhost:$port \
-        --request-rate $qps \
-        --num-prompts $num_prompts \
-      "
-
-    echo "Client command: $client_command"
-
-    eval "$client_command"
-
-    #TODO: process/record outputs
-    done
-  done
-
-  kill_gpu_processes
-
-}
-
-prepare_dataset() {
-
-  # download sharegpt dataset
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  echo "" > sonnet_4x.txt
-  for _ in {1..4}
-  do
-    cat sonnet.txt >> sonnet_4x.txt
-  done
-
-}
-
-main() {
-
-  # check if the environment variable is successfully injected from yaml
-
-  check_gpus
-  check_hf_token
-  get_current_llm_serving_engine
-
-  pip install -U transformers
-
-  pip install -r requirements/dev.txt
-  which genai-perf
-
-  # check storage
-  df -h
-
-  ensure_installed wget
-  ensure_installed curl
-  ensure_installed jq
-  # genai-perf dependency
-  ensure_installed libb64-0d
-
-  prepare_dataset
-
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-
-  # run the test
-  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
-
-  # run genai-perf tests
-  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
-  mv artifacts/ $RESULTS_FOLDER/
-
-  # upload benchmark results to buildkite
-  python3 -m pip install tabulate pandas
-  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
-  upload_to_buildkite
-
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import datetime
-import json
-import os
-from pathlib import Path
-
-import pandas as pd
-from tabulate import tabulate
-
-results_folder = Path("results/")
-
-# serving results and the keys that will be printed into markdown
-serving_results = []
-serving_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    "completed": "Successful req.",
-    "request_throughput": "Tput (req/s)",
-    "mean_ttft_ms": "Mean TTFT (ms)",
-    "std_ttft_ms": "Std TTFT (ms)",
-    "median_ttft_ms": "Median TTFT (ms)",
-    "mean_itl_ms": "Mean ITL (ms)",
-    "std_itl_ms": "Std ITL (ms)",
-    "median_itl_ms": "Median ITL (ms)",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "std_tpot_ms": "Std TPOT (ms)",
-    "median_tpot_ms": "Median TPOT (ms)",
-    "total_token_throughput": "Total Token Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
-    "total_input_tokens": "Total input tokens",
-    "total_output_tokens": "Total output tokens",
-    "engine": "Engine",
-}
-
-if __name__ == "__main__":
-    # collect results
-    for test_file in results_folder.glob("*.json"):
-        with open(test_file) as f:
-            raw_result = json.loads(f.read())
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands")) as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    serving_results = pd.DataFrame.from_dict(serving_results)
-
-    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            columns=serving_column_mapping
-        )
-
-    serving_md_table_with_headers = tabulate(
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split("\n")
-    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
-
-    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
-
-    # document benchmarking results in markdown
-    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
-        # document results with header.
-        # for those who wants to reproduce our benchmark.
-        f.write(serving_md_table_with_headers)
-        f.write("\n")
-
-    # document benchmarking results in json
-    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient="records")
-        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -1,23 +0,0 @@
-#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
-else
-    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-fi
-
-TIMEOUT_SECONDS=10
-
-retries=0
-while [ $retries -lt 1000 ]; do
-    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
-        exit 0
-    fi
-
-    echo "Waiting for image to be available..."
-
-    retries=$((retries + 1))
-    sleep 5
-done
-
-exit 1
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -2,40 +2,23 @@

 ## Introduction

-This directory contains two sets of benchmark for vllm.
-
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
+vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.

 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.

 **Benchmarking Duration**: about 1hr.

 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.

-## Nightly benchmark quick overview
-
-**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
-
-**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
-
-**Benchmarking Duration**: about 3.5hrs.
-
 ## Trigger the benchmark

-Performance benchmark will be triggered when:
-
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
-
-Manually Trigger the benchmark
+The benchmark needs to be triggered manually:

 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```

 Runtime environment variables:
@@ -47,14 +30,11 @@ Runtime environment variables:
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

-Nightly benchmark will be triggered when:
-
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
-
 ## Performance benchmark details

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
 >
 ### Latency test

@@ -152,26 +132,3 @@ Here is an example using the script to compare result_a and result_b with Model,
 A comparison diagram will be generated below the table.
 Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
-
-## Nightly test details
-
-See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
-
-### Workflow
-
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
-
-### Nightly tests
-
-In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
-
-### Docker containers
-
-The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
-
-WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
--- a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
@@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).

@@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.

@@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -392,7 +392,7 @@ if __name__ == "__main__":
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
+            "../.buildkite/performance-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
--- a/.buildkite/performance-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/performance-benchmarks/scripts/launch-server.sh
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -15,6 +15,8 @@ check_gpus() {
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
@@ -23,10 +25,16 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
+  
+  declare -g arch_suffix=''
+  
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
  fi
  echo "GPU type is $gpu_type"
 }
@@ -138,6 +146,10 @@ kill_gpu_processes() {
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
  fi

  # remove vllm config file
@@ -451,6 +463,7 @@ main() {
     ARCH='-cpu'
  else
     check_gpus
+     ARCH="$arch_suffix"
  fi
  check_hf_token

@@ -469,7 +482,7 @@ main() {
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
--- a/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
--- a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
--- a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -0,0 +1,55 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/latency-tests.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests.json
--- a/.buildkite/performance-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/performance-benchmarks/tests/nightly-tests.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -0,0 +1,82 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -0,0 +1,61 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests.json
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -116,24 +116,6 @@ steps:
    commands:
      - "bash .buildkite/scripts/annotate-release.sh"

-  - label: "Build and publish TPU release image"
-    depends_on: ~
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: tpu_queue_postmerge
-    commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
-      - "docker push vllm/vllm-tpu:nightly"
-      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-
  - input: "Provide Release version here"
    id: input-release-version
    fields:
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -2,16 +2,23 @@

 set -ex

-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
-
-if [ -z "$RELEASE_VERSION" ]; then
-  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-  exit 1
+# Get release version, default to 1.0.0.dev for nightly/per-commit builds
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
+if [ -z "${RELEASE_VERSION}" ]; then
+  RELEASE_VERSION="1.0.0.dev"
 fi

 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
+To download the wheel (by commit):
+\`\`\`
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`
+
+To download the wheel (by version):
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -173,6 +173,14 @@ fi
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."

+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+  echo "Error: 'render' group not found. This is required for GPU access." >&2
+  exit 1
+fi
+
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used   
@@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
        --network=host \
        --shm-size=16gb \
+        --group-add "$render_gid" \
        --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
        -e HF_TOKEN \
@@ -217,8 +226,8 @@ else
          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
          --network=host \
          --shm-size=16gb \
+          --group-add "$render_gid" \
          --rm \
-          -e HIP_VISIBLE_DEVICES=0 \
          -e HF_TOKEN \
          -e AWS_ACCESS_KEY_ID \
          -e AWS_SECRET_ACCESS_KEY \
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -20,7 +20,10 @@ trap remove_docker_container EXIT

 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri \
+    --device /dev/dri:/dev/dri \
+    --net=host \
+    --ipc=host \
+    --privileged \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    -e "HF_TOKEN=${HF_TOKEN}" \
@@ -42,7 +45,7 @@ docker run \
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8010}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="deepseek-ai/DeepSeek-V2-lite"
+BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.8}
+NUM_Q=${2:-1319}
+PORT=${3:-8020}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="QWen/Qwen3-30B-A3B-FP8"
+BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
@@ -48,8 +48,8 @@ steps:
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh

- label: Async Engine, Inputs, Utils, Worker Test # 36min
-  timeout_in_minutes: 50
+- label: Async Engine, Inputs, Utils, Worker Test # 10min
+  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
@@ -286,7 +286,7 @@ steps:

 - label: Engine Test # 25min
  timeout_in_minutes: 40
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  #grade: Blocking
  source_file_dependencies:
@@ -318,7 +318,7 @@ steps:

 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -344,7 +344,7 @@ steps:
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
    - pytest -v -s v1/spec_decode
-    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_request.py
@@ -441,7 +441,7 @@ steps:
      --ignore=lora/test_llm_with_multi_loras.py \
      --ignore=lora/test_olmoe_tp.py \
      --ignore=lora/test_deepseekv2_tp.py \
-      --ignore=lora/test_gptoss.py \
+      --ignore=lora/test_gptoss_tp.py \
      --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4

@@ -561,7 +561,7 @@ steps:

 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -616,9 +616,9 @@ steps:
  - uv pip install --system torchao==0.13.0
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: LM Eval Small Models # 53min
-  timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+- label: LM Eval Small Models # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -627,17 +627,18 @@ steps:
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: OpenAI API correctness # 22min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+- label: OpenAI API correctness # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
-  commands: # LMEval+Transcription WER check
-  - pytest -s entrypoints/openai/correctness/
+  commands: # LMEval
+  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
+  - pytest -s entrypoints/openai/correctness/  --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py

 - label: OpenAI-Compatible Tool Use # 23 min
  timeout_in_minutes: 35
@@ -789,8 +790,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 - label: Language Models Test (PPL)
@@ -856,10 +859,10 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  mirror_hardwares: [amdexperimental]
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
-  timeout_in_minutes: 70
+  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
@@ -906,7 +909,7 @@ steps:

 - label: Quantized Models Test # 45 min
  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -1215,6 +1218,8 @@ steps:
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py
+

 - label: Weight Loading Multiple GPU Test  # 33min
  timeout_in_minutes: 45
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
@@ -205,6 +205,24 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
@@ -214,8 +232,8 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

- label: EPLB Execution Test # 5min
-  timeout_in_minutes: 15
+- label: EPLB Execution Test # 10min
+  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -223,6 +241,7 @@ steps:
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py

 - label: Metrics, Tracing Test # 12min
  timeout_in_minutes: 20
@@ -297,6 +316,7 @@ steps:
    - vllm/
    - tests/v1
  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    # split the test to avoid interference
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
@@ -313,6 +333,24 @@ steps:
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

+- label: V1 Test attention (H100) # 10min
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
 - label: V1 Test others (CPU) # 5 mins
  source_file_dependencies:
    - vllm/
@@ -390,9 +428,9 @@ steps:
      --ignore=lora/test_llm_with_multi_loras.py \
      --ignore=lora/test_olmoe_tp.py \
      --ignore=lora/test_deepseekv2_tp.py \
-      --ignore=lora/test_gptoss.py \
+      --ignore=lora/test_gptoss_tp.py \
      --ignore=lora/test_qwen3moe_tp.py
-      
+
  parallelism: 4

 - label: PyTorch Compilation Unit Tests # 15min
@@ -422,6 +460,7 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/test_multimodal_compile.py
  - pytest -v -s compile/piecewise/

 - label: PyTorch Fullgraph Test # 22min
@@ -433,7 +472,21 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
+    # Limit to no custom ops to reduce running time 
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@@ -477,6 +530,8 @@ steps:
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@@ -493,8 +548,11 @@ steps:

 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
+  torch_nightly: true
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -687,8 +745,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 - label: Language Models Test (PPL)
@@ -871,6 +931,29 @@ steps:
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml 
+    - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusions_e2e.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
    - pytest -v -s tests/compile/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
@@ -1076,6 +1159,7 @@ steps:
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py


 - label: Weight Loading Multiple GPU Test  # 33min
@@ -1101,7 +1185,7 @@ steps:
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-  
+
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
@@ -1143,6 +1227,19 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
  gpu: h200
@@ -1156,6 +1253,7 @@ steps:
    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1166,6 +1264,7 @@ steps:
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
@@ -1178,3 +1277,21 @@ steps:
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -9,7 +9,7 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -105,11 +105,21 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/triton_unified_attention.py @tdoublep

 # ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
-/vllm/v1/attention/backends/rocm*.py @gshtras
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
-/vllm/attention/ops/rocm*.py @gshtras
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+/vllm/**/*rocm* @tjtanaa
+/docker/Dockerfile.rocm* @gshtras @tjtanaa
+/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
+/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
+/csrc/rocm @gshtras @tjtanaa
+/requirements/*rocm* @tjtanaa
+/tests/**/*rocm* @tjtanaa
+/docs/**/*rocm* @tjtanaa
+/vllm/**/*quark* @tjtanaa
+/tests/**/*quark* @tjtanaa
+/docs/**/*quark* @tjtanaa
+/vllm/**/*aiter* @tjtanaa
+/tests/**/*aiter* @tjtanaa

 # TPU
 /vllm/v1/worker/tpu* @NickLucche
@@ -127,3 +137,8 @@ mkdocs.yaml @hmellor
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
+
+# Security guide and policies
+/docs/usage/security.md @russellb
+/SECURITY.md @russellb
+/docs/contributing/vulnerability_management.md @russellb
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -108,7 +108,7 @@ pull_request_rules:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
      - files~=^tests/benchmarks/
-      - files~=^\.buildkite/nightly-benchmarks/
+      - files~=^\.buildkite/performance-benchmarks/
  actions:
    label:
      add:
--- a/.gitignore
+++ b/.gitignore
@@ -221,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_*

 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
+
+# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
+!vllm/benchmarks/lib/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,14 +38,14 @@ repos:
  rev: 0.9.1
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
  - id: format-torch-nightly-test
    name: reformat nightly_torch_test.txt to be in sync with test.in
    language: python
-    entry: python tools/generate_nightly_torch_test.py
+    entry: python tools/pre_commit/generate_nightly_torch_test.py
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
    name: Run mypy locally for lowest supported Python version
@@ -78,12 +78,12 @@ repos:
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
-    entry: tools/shellcheck.sh
+    entry: tools/pre_commit/shellcheck.sh
    language: script
    types: [shell]
  - id: png-lint
    name: Lint PNG exports from excalidraw
-    entry: tools/png-lint.sh
+    entry: tools/pre_commit/png-lint.sh
    language: script
    types: [png]
  - id: signoff-commit
@@ -100,12 +100,12 @@ repos:
    stages: [commit-msg]
  - id: check-spdx-header
    name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
+    entry: python tools/pre_commit/check_spdx_header.py
    language: python
    types: [python]
  - id: check-root-lazy-imports
    name: Check root lazy imports
-    entry: python tools/check_init_lazy_imports.py
+    entry: python tools/pre_commit/check_init_lazy_imports.py
    language: python
    types: [python]
  - id: check-filenames
@@ -119,11 +119,11 @@ repos:
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
+    entry: tools/pre_commit/update-dockerfile-graph.sh
    language: script
  - id: enforce-import-regex-instead-of-re
    name: Enforce import regex as re
-    entry: python tools/enforce_regex_import.py
+    entry: python tools/pre_commit/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
@@ -131,7 +131,7 @@ repos:
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
-    entry: python tools/check_triton_import.py
+    entry: python tools/pre_commit/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
@@ -144,7 +144,7 @@ repos:
    additional_dependencies: [regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
-    entry: python tools/validate_config.py
+    entry: python tools/pre_commit/validate_config.py
    language: python
    additional_dependencies: [regex]
  # Keep `suggestion` last
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -241,7 +241,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  message(STATUS "Enabling cumem allocator extension.")
  # link against cuda driver library
  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
-  define_gpu_extension_target(
+  define_extension_target(
    cumem_allocator
    DESTINATION vllm
    LANGUAGE CXX
@@ -858,7 +858,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
 endif()

 message(STATUS "Enabling C extension.")
-define_gpu_extension_target(
+define_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
@@ -973,7 +973,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()

 message(STATUS "Enabling moe extension.")
-define_gpu_extension_target(
+define_extension_target(
  _moe_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
@@ -994,7 +994,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")

-  define_gpu_extension_target(
+  define_extension_target(
    _rocm_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
--- a/README.md
+++ b/README.md
@@ -21,6 +21,8 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

+- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
+- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
@@ -82,7 +84,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support

--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -5,7 +5,7 @@ import gc
 from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool


--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -46,7 +46,7 @@ import time

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -19,7 +19,7 @@ from vllm.config import (
    VllmConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -11,7 +11,7 @@ import time
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 # Select a equi-probable random priority
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -51,7 +51,7 @@ except ImportError:
    from backend_request_func import get_tokenizer

 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors
 from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -18,7 +18,8 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_triton_block_scaled_mm,
 )
-from vllm.utils import FlexibleArgumentParser, cdiv
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import cdiv

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -10,7 +10,7 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -10,7 +10,7 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 batch_size_range = [1, 16, 32, 64, 128]
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@@ -28,7 +28,7 @@ except ImportError as e:

 from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 parser = FlexibleArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target."
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 WEIGHT_SHAPES_MOE = {
    "nvidia/DeepSeek-R1-FP4": [
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 # Weight shapes for different models: [num_experts, topk, hidden_size,
 # intermediate_size]
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
 )
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 logger = init_logger(__name__)

--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -13,11 +13,11 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    fused_topk,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = [
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
-    "nm-testing/deepseekv2-lite",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "deepseek-ai/DeepSeek-V2-Lite",
    "ibm-granite/granite-3.0-1b-a400m",
    "ibm-granite/granite-3.0-3b-a800m",
 ]
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -7,7 +7,7 @@ import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -19,13 +19,24 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES

-from vllm.triton_utils import HAS_TRITON
+from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
+from vllm.triton_utils import HAS_TRITON, triton

 if HAS_TRITON:
-    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
+    from vllm.lora.ops.triton_ops import (  ## added fused_moe_lora
+        LoRAKernelMeta,
+        fused_moe_lora_expand,
+        fused_moe_lora_shrink,
+        lora_expand,
+        lora_shrink,
+    )
+    from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
+        _LORA_PTR_DICT,  ## added _LORA_PTR_DICT for fused_moe_lora
+    )
    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-
-from vllm.utils import FlexibleArgumentParser
+from vllm import _custom_ops as ops
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import round_up

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_TP_SIZES = [1]
@@ -59,6 +70,8 @@ DEFAULT_NUM_LORAS = [1, 2, 3, 4]
 DEFAULT_SORT_BY_LORA_IDS = [False, True]
 DEFAULT_SEQ_LENGTHS = [1]
 DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
+DEFAULT_TOP_K_NUMS = [1]  # Added for MoE LoRA top_k
+DEFAULT_NUM_EXPERTS = [8]  # Added for MoE LoRA num_experts


 # Utilities
@@ -191,6 +204,11 @@ class OpType(Enum):

    LORA_SHRINK = auto()
    LORA_EXPAND = auto()
+    ## Adding support for fused moe lora
+    FUSED_MOE_LORA_GATE_UP_SHRINK = auto()  ## Gate/Up projection variant with shrink
+    FUSED_MOE_LORA_GATE_UP_EXPAND = auto()  ## Gate/Up projection variant with expand
+    FUSED_MOE_LORA_DOWN_SHRINK = auto()  ## Down projection variant with shrink
+    FUSED_MOE_LORA_DOWN_EXPAND = auto()  ## Down projection variant with expand

    @staticmethod
    def from_str(s: str) -> "OpType":
@@ -198,6 +216,15 @@ class OpType(Enum):
            return OpType.LORA_SHRINK
        if s.lower() == "lora_expand":
            return OpType.LORA_EXPAND
+        # Adding support for fused moe lora, both in gate_up and down
+        if s.lower() == "fused_moe_lora_gate_up_shrink":  ## Gate/Up variant with shrink
+            return OpType.FUSED_MOE_LORA_GATE_UP_SHRINK
+        if s.lower() == "fused_moe_lora_gate_up_expand":  ## Gate/Up variant with expand
+            return OpType.FUSED_MOE_LORA_GATE_UP_EXPAND
+        if s.lower() == "fused_moe_lora_down_shrink":  ## Down variant with shrink
+            return OpType.FUSED_MOE_LORA_DOWN_SHRINK
+        if s.lower() == "fused_moe_lora_down_expand":  ## Down variant with expand
+            return OpType.FUSED_MOE_LORA_DOWN_EXPAND
        raise ValueError(f"Unrecognized str {s} to convert to OpType")

    def is_shrink_fn(self) -> bool:
@@ -206,19 +233,56 @@ class OpType(Enum):
    def is_expand_fn(self) -> bool:
        return self in [OpType.LORA_EXPAND]

+    def is_fused_moe_lora_fn(self) -> bool:  ## adding for fused MoE LoRA
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
+    def is_fused_moe_lora_gate_up_fn(
+        self,
+    ) -> bool:  ## adding for fused MoE LoRA Gate/Up
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+        ]
+
+    def is_fused_moe_lora_down_fn(self) -> bool:  ## adding for fused MoE LoRA Down
+        return self in [
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
+    def is_fused_moe_lora_shrink_fn(self) -> bool:
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+        ]
+
+    def is_fused_moe_lora_expand_fn(self) -> bool:
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
    def num_slices(self) -> list[int]:
+        if self.is_fused_moe_lora_gate_up_fn():
+            return [2]
+        elif self.is_fused_moe_lora_down_fn():
+            return [1]
        return [1, 2, 3]

    def mkn(
        self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int
    ) -> tuple[int, int, int]:
        num_tokens = batch_size * seq_length
-        if self.is_shrink_fn():
+        if self.is_shrink_fn() or self.is_fused_moe_lora_fn():
            m = num_tokens
            k = hidden_size
            n = lora_rank
-        else:
-            assert self.is_expand_fn()
+        elif self.is_expand_fn():
            m = num_tokens
            k = lora_rank
            n = hidden_size
@@ -232,9 +296,36 @@ class OpType(Enum):
        """
        if self.is_shrink_fn():
            return op_dtype, op_dtype, torch.float32
-        else:
-            assert self.is_expand_fn()
+        elif self.is_expand_fn():
            return torch.float32, op_dtype, op_dtype
+        else:
+            assert self.is_fused_moe_lora_fn()
+            return op_dtype, op_dtype, op_dtype
+
+    def matmul_shapes_fused_moe_lora(
+        self,
+        m: int,
+        n: int,
+        k: int,
+        num_loras: int,
+        num_slices: int,
+        top_k_num: int,
+        num_experts: int,
+    ) -> tuple[tuple[int], tuple[int], tuple[int], tuple[int]]:
+        if self.is_fused_moe_lora_shrink_fn():
+            input_shape = (
+                (m * top_k_num, n)
+                if self in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+                else (m, n)
+            )
+            output_shape = (num_slices, m, top_k_num, k)
+            weight_shape = (num_loras, num_experts, k, n)
+        else:
+            assert self.is_fused_moe_lora_expand_fn()
+            input_shape = (num_slices, m, top_k_num, k)
+            output_shape = (m, top_k_num, n * num_slices)
+            weight_shape = (num_loras, num_experts, n, k)
+        return (input_shape, weight_shape, output_shape)

    def matmul_shapes(
        self,
@@ -244,6 +335,8 @@ class OpType(Enum):
        lora_rank: int,
        num_loras: int,
        num_slices: int,
+        top_k_num: int | None = None,
+        num_experts: int | None = None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """
        Given num_slices, return the shapes of the A, B, and C matrices
@@ -258,6 +351,16 @@ class OpType(Enum):
        if self in [OpType.LORA_EXPAND]:
            # LoRA expand kernels support num_slices inherently in the kernel
            return ((num_slices, m, k), b_shape, (m, n * num_slices))
+        if self.is_fused_moe_lora_fn():
+            return self.matmul_shapes_fused_moe_lora(
+                m,
+                k,
+                n,
+                num_loras,
+                num_slices,
+                top_k_num,
+                num_experts,
+            )
        raise ValueError(f"Unrecognized op_type {self}")

    def bench_fn(self) -> Callable:
@@ -265,6 +368,16 @@ class OpType(Enum):
            return lora_shrink
        if self == OpType.LORA_EXPAND:
            return lora_expand
+        if self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+        ]:
+            return fused_moe_lora_shrink
+        if self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]:
+            return fused_moe_lora_expand

        raise ValueError(f"Unrecognized optype {self}")

@@ -318,6 +431,8 @@ class BenchmarkContext:
    sort_by_lora_id: bool
    dtype: torch.dtype
    seq_length: int | None = None
+    num_experts: int | None = None  # num_experts for MoE based ops
+    top_k_num: int | None = None  # top_k for MoE based ops
    num_slices: int | None = None  # num_slices for slice based ops

    def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
@@ -373,6 +488,11 @@ class BenchmarkTensors:
            f"{dtype_to_str(self.output.dtype)}"
        )

+    def get_num_tokens(self, size: int, top_k_num: int, op_type: OpType):
+        return (
+            size * top_k_num if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] else size
+        )
+
    @staticmethod
    def make(
        ctx: BenchmarkContext, op_type: OpType, device: str = "cuda"
@@ -385,6 +505,8 @@ class BenchmarkTensors:
            ctx.lora_rank,
            ctx.num_loras,
            ctx.num_slices,
+            ctx.top_k_num,
+            ctx.num_experts,
        )
        a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
        input_tensor, lora_weights, output_tensor = make_rand_tensors(
@@ -432,17 +554,27 @@ class BenchmarkTensors:
            prompt_lora_indices_tensor,
        )

-    def sanity_check(self) -> None:
+    def sanity_check(self, ctx: BenchmarkContext, op_type: OpType) -> None:
        """
        Fails asserts when non-conformality is detected.
        """
-        num_tokens = self.input.shape[-2]
+        num_tokens = (
+            self.input.shape[1]
+            if op_type.is_fused_moe_lora_expand_fn()
+            else self.input.shape[-2]
+        )
        # check metadata tensors
-        assert torch.sum(self.seq_lens) == num_tokens
+        ## In down shrink case, each token is repeated top_k_num times
+        assert num_tokens == self.get_num_tokens(
+            torch.sum(self.seq_lens), ctx.top_k_num, op_type
+        ), f"Expected {num_tokens} tokens, but got {torch.sum(self.seq_lens)}"
        num_seqs = self.seq_lens.shape[0]
        # assert self.seq_start_loc.shape[0] == num_seqs
+        ## In down shrink case, each prompt corresponds to top_k_num sequences
        assert self.prompt_lora_mapping.shape[0] == num_seqs
-        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
+        assert self.get_num_tokens(
+            self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
+        )

    def to_device(self, device: str):
        """
@@ -471,21 +603,111 @@ class BenchmarkTensors:
                to_device(field) if field_name != "no_lora_flag_cpu" else field,
            )

-    def metadata(self) -> tuple[int, int, int]:
+    def metadata(self, ctx: BenchmarkContext, op_type: OpType) -> tuple[int, int, int]:
        """
        Return num_seqs, num_tokens and max_seq_len
        """
        num_seqs = self.seq_lens.shape[0]
-        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
+        num_tokens = self.get_num_tokens(
+            self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
+        )
        max_seq_len = torch.max(self.seq_lens).item()
        num_slices = len(self.lora_weights_lst)
        return num_seqs, num_tokens, max_seq_len, num_slices

-    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
-        self.sanity_check()
+    def fused_moe_lora_data_prepare(
+        self,
+        block_size: int,
+        token_lora_mapping: torch.Tensor,
+        ctx: BenchmarkContext,
+    ):
+        def moe_lora_align_block_size(
+            topk_ids: torch.Tensor,
+            token_lora_mapping: torch.Tensor,
+            block_size: int,
+            num_experts: int,
+            max_loras: int,
+            expert_map: torch.Tensor | None = None,
+            pad_sorted_ids: bool = False,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            """
+            Aligns tokens and experts into block-sized chunks for LoRA-based
+            mixture-of-experts (MoE) execution.
+            """
+            max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+            if pad_sorted_ids:
+                max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+            sorted_ids = torch.empty(
+                (max_loras * max_num_tokens_padded,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+            # Expert ids must be set default to -1 to prevent a blank block
+            expert_ids = torch.empty(
+                (max_loras * max_num_m_blocks,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            num_tokens_post_pad = torch.empty(
+                (max_loras), dtype=torch.int32, device=topk_ids.device
+            )
+
+            ops.moe_lora_align_block_size(
+                topk_ids,
+                token_lora_mapping,
+                num_experts,
+                block_size,
+                max_loras,
+                max_num_tokens_padded,
+                max_num_m_blocks,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+            )
+            if expert_map is not None:
+                expert_ids = expert_map[expert_ids]
+
+            return sorted_ids, expert_ids, num_tokens_post_pad
+
+        num_tokens = ctx.batch_size
+        curr_topk_ids = torch.randint(
+            0,
+            ctx.num_experts,
+            (num_tokens, ctx.top_k_num),
+            device="cuda",
+            dtype=torch.int32,
+        )
+        topk_weights = torch.randint(
+            0,
+            ctx.num_experts,
+            (num_tokens, ctx.top_k_num),
+            device="cuda",
+            dtype=torch.int32,
+        )
+
+        (sorted_token_ids_lora, expert_ids_lora, num_tokens_post_padded_lora) = (
+            moe_lora_align_block_size(
+                topk_ids=curr_topk_ids,
+                token_lora_mapping=token_lora_mapping,
+                block_size=block_size,
+                num_experts=ctx.num_experts,
+                max_loras=ctx.num_loras,
+            )
+        )
+
+        sorted_token_ids = sorted_token_ids_lora.view(ctx.num_loras, -1)
+        expert_ids = expert_ids_lora.view(ctx.num_loras, -1)
+        num_tokens_post_padded = num_tokens_post_padded_lora
+        return (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded)
+
+    def as_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
        self.to_device(self.input.device)

-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

        # Sanity check matrix shapes.
        i_shape, lw_shape, o_shape = (
@@ -520,11 +742,13 @@ class BenchmarkTensors:
            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
        }

-    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        self.sanity_check()
+    def as_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
        self.to_device(self.input.device)

-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

        # Sanity check matrix shapes.
        i_shape, lw_shape, o_shape = (
@@ -561,18 +785,173 @@ class BenchmarkTensors:
            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
        }

-    def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: bool | None = None
+    def as_fused_moe_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
    ) -> dict[str, Any]:
-        if op_type.is_shrink_fn():
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+        # Expected input shape : [num_tokens, hidden_size] for gate_up
+        # Expected input shape : [top_k_num * num_tokens, hidden_size] for down
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [max_lora, num_experts, lora_rank, hidden_size]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == hidden_size
+        lora_rank = lw_shape[-2]
+        # Expected output shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(o_shape) == 4
+        assert (
+            o_shape
+            == (num_slices, num_tokens // ctx.top_k_num, ctx.top_k_num, lora_rank)
+            if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+            else o_shape == (num_slices, num_tokens, ctx.top_k_num, lora_rank)
+        )
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "qcurr_hidden_states": self.input,
+            "lora_a_stacked": self.lora_weights_lst,
+            "a_intermediate_cache1": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "shrink_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "shrink_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "shrink_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "shrink_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "shrink_num_warps": kernel_config["NUM_WARPS"],
+            "shrink_num_stages": kernel_config["NUM_STAGES"],
+            "shrink_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
+    def as_fused_moe_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+
+        # Expected input shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(i_shape) == 4
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[-1]
+        # Expected lora weight shape : [num_loras, num_experts, hidden_size, lora_rank]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == lora_rank
+        hidden_size = lw_shape[-2]
+        # Expected output shape : [num_tokens, top_k_num, hidden_size * num_slices]
+        assert len(o_shape) == 3
+        assert o_shape == (num_tokens, ctx.top_k_num, hidden_size * num_slices)
+
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "a_intermediate_cache1": self.input,
+            "lora_b_stacked": self.lora_weights_lst,
+            "output": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "max_lora_rank": lora_rank,
+            "w1_output_dim_size": lw_shape[2],
+            "expand_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "expand_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "expand_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "expand_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "expand_num_warps": kernel_config["NUM_WARPS"],
+            "expand_num_stages": kernel_config["NUM_STAGES"],
+            "expand_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
+    def bench_fn_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool | None = None
+    ) -> dict[str, Any]:
+        if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
            assert add_inputs is None
        else:
            assert add_inputs is not None

        if op_type == OpType.LORA_SHRINK:
-            return self.as_lora_shrink_kwargs()
+            return self.as_lora_shrink_kwargs(ctx, op_type)
        if op_type == OpType.LORA_EXPAND:
-            return self.as_lora_expand_kwargs(add_inputs)
+            return self.as_lora_expand_kwargs(ctx, op_type, add_inputs)
+        if op_type.is_fused_moe_lora_shrink_fn():
+            return self.as_fused_moe_lora_shrink_kwargs(ctx, op_type)
+        if op_type.is_fused_moe_lora_expand_fn():
+            return self.as_fused_moe_lora_expand_kwargs(ctx, op_type)
        raise ValueError(f"Unrecognized optype {self}")

    def test_correctness(
@@ -617,7 +996,7 @@ def bench_optype(
    test_correctness: bool = False,
 ) -> TMeasurement:
    assert arg_pool_size >= 1
-    if op_type.is_shrink_fn():
+    if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
        assert expand_fn_add_inputs is None
    else:
        assert expand_fn_add_inputs is not None
@@ -627,23 +1006,30 @@ def bench_optype(
        BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)
    ]
    for bt in bench_tensors:
-        bt.sanity_check()
+        bt.sanity_check(ctx, op_type)

    # Test correctness of our implementation.
    if test_correctness:
+        assert op_type in [OpType.LORA_SHRINK, OpType.LORA_EXPAND], (
+            f"Correctness testing is not supported for {op_type.name}."
+        )
        assert all(
-            [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors]
+            [
+                bt.test_correctness(ctx, op_type, expand_fn_add_inputs)
+                for bt in bench_tensors
+            ]
        )

    # BenchmarkTensors -> dict (kwargs)
    kwargs_list = [
-        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+        bt.bench_fn_kwargs(ctx, op_type, add_inputs=expand_fn_add_inputs)
        for bt in bench_tensors
    ]

    # Clear LoRA optimization hash-maps.
    _LORA_A_PTR_DICT.clear()
    _LORA_B_PTR_DICT.clear()
+    _LORA_PTR_DICT.clear()
    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
@@ -793,7 +1179,9 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):

                    # Benchmark bench_op
                    expand_fn_add_inputs = (
-                        [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
+                        [None]
+                        if bench_op.is_shrink_fn() or bench_op.is_fused_moe_lora_fn()
+                        else args.expand_fn_add_inputs
                    )
                    for add_input_arg in expand_fn_add_inputs:
                        seq_len_timers.append(
@@ -831,12 +1219,22 @@ def as_benchmark_contexts(
    hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace
 ) -> list[BenchmarkContext]:
    ctxs: list[BenchmarkContext] = []
-    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
+    for (
+        batch_size,
+        hidden_size,
+        lora_rank,
+        num_loras,
+        sort_by_lora_id,
+        top_k_num,
+        num_experts,
+    ) in product(  # noqa
        args.batch_sizes,
        list(hidden_sizes),
        lora_ranks,
        args.num_loras,
        args.sort_by_lora_id,
+        args.top_k_nums,
+        args.num_experts,
    ):
        ctxs.append(
            BenchmarkContext(
@@ -851,6 +1249,8 @@ def as_benchmark_contexts(
                seq_length=None,
                sort_by_lora_id=sort_by_lora_id,
                dtype=args.dtype,
+                top_k_num=top_k_num,
+                num_experts=num_experts,
                # To be filled based on the OpType to benchmark
                num_slices=None,
            )
@@ -1012,6 +1412,22 @@ if __name__ == "__main__":
            ),
        )

+        p.add_argument(
+            "--top-k-nums",
+            nargs="+",
+            type=int,
+            default=DEFAULT_TOP_K_NUMS,
+            help="Top-K values for MoE LoRA operations",
+        )
+
+        p.add_argument(
+            "--num-experts",
+            nargs="+",
+            type=int,
+            default=DEFAULT_NUM_EXPERTS,
+            help="Number of experts for MoE LoRA operations",
+        )
+
    parser = FlexibleArgumentParser(
        description=f"""
 Benchmark LoRA kernels:
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    quantize_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    sort_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()

@@ -211,7 +211,7 @@ def get_rocm_tuning_space(use_fp16):
    num_warps_range = [1, 2, 4, 8]
    group_m_range = [1, 4, 8, 16, 32]
    num_stage_range = [2]
-    waves_per_eu_range = [0]
+    waves_per_eu_range = [0, 1, 2, 4]
    matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
    kpack_range = [1, 2] if use_fp16 else []

@@ -590,6 +590,7 @@ def main(args: argparse.Namespace):
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
        "Glm4MoeForCausalLM",
+        "NemotronHForCausalLM",
    ):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
@@ -615,6 +616,11 @@ def main(args: argparse.Namespace):
        topk = config.moe_topk[0]
        intermediate_size = config.moe_intermediate_size[0]
        hidden_size = config.hidden_size
+    elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
+        E = config.thinker_config.text_config.num_experts
+        topk = config.thinker_config.text_config.num_experts_per_tok
+        intermediate_size = config.thinker_config.text_config.moe_intermediate_size
+        hidden_size = config.thinker_config.text_config.hidden_size
    else:
        # Support for llama4
        config = config.get_text_config()
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 )
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()

--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -39,7 +39,7 @@ import torch
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -9,7 +9,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    create_kv_caches_with_random,
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -7,7 +7,7 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -9,7 +9,7 @@ from tabulate import tabulate
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    create_kv_caches_with_random,
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -12,7 +12,7 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    create_kv_caches_with_random_flash,
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -8,7 +8,7 @@ import torch

 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 def benchmark_rope_kernels_multi_lora(
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -78,11 +78,11 @@ WEIGHT_SHAPES = {
 }

 WEIGHT_SHAPES_MOE = {
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1": [
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
        [8, 2, 4096, 28672],
        [8, 2, 14336, 4096],
    ],
-    "nm-testing/deepseekv2-lite": [
+    "deepseek-ai/DeepSeek-V2-Lite": [
        [64, 6, 2048, 1408],
    ],
    "ibm-granite/granite-3.0-1b-a400m": [
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -8,7 +8,7 @@ from datetime import datetime
 import flashinfer
 import torch

-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -8,7 +8,7 @@ from datetime import datetime
 import flashinfer
 import torch

-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)

--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -11,7 +11,7 @@ import regex as re
 import seaborn as sns
 from torch.utils.benchmark import Measurement as TMeasurement

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -1429,8 +1429,6 @@ async def main() -> None:
    random.seed(args.seed)
    np.random.seed(args.seed)

-    if not os.path.exists(args.model):
-        raise OSError(f"Path does not exist: {args.model}")
    logger.info("Loading tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(args.model)

--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -5,7 +5,7 @@ import cProfile
 import pstats

 from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -212,11 +212,24 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        # Build ACL with scons
        include(ProcessorCount)
        ProcessorCount(_NPROC)
+        set(_scons_cmd
+        scons -j${_NPROC}
+            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
+            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
+            multi_isa=1 openmp=1 cppthreads=0
+        )
+
+        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+        # and create a local shim dir with it
+        include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
+        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+        if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
+            list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
+        endif()
+
        execute_process(
-            COMMAND scons -j${_NPROC}
-                    Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
-                    arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
-                    multi_isa=1 openmp=1 cppthreads=0
+            COMMAND ${_scons_cmd}
            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
            RESULT_VARIABLE _acl_rc
        )
@@ -330,7 +343,7 @@ message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
 # Define extension targets
 #

-define_gpu_extension_target(
+define_extension_target(
    _C
    DESTINATION vllm
    LANGUAGE CXX
@@ -341,4 +354,4 @@ define_gpu_extension_target(
    WITH_SOABI
 )

-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -92,7 +92,7 @@ if(FLASH_MLA_ARCHS)
        SRCS "${FlashMLA_Extension_SOURCES}"
        CUDA_ARCHS "${FLASH_MLA_ARCHS}")

-    define_gpu_extension_target(
+    define_extension_target(
        _flashmla_C
        DESTINATION vllm
        LANGUAGE ${VLLM_GPU_LANG}
@@ -109,7 +109,7 @@ if(FLASH_MLA_ARCHS)
        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)

-    define_gpu_extension_target(
+    define_extension_target(
        _flashmla_extension_C
        DESTINATION vllm
        LANGUAGE ${VLLM_GPU_LANG}
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
+          GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
 endfunction()

+# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
+#   libgomp.so    -> libgomp-<hash>.so...
+#   libgomp.so.1  -> libgomp-<hash>.so...
+# OUTPUT: TORCH_GOMP_SHIM_DIR  ("" if not found)
+function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
+  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
+
+  # Use run_python to locate vendored libgomp; never throw on failure.
+  run_python(_VLLM_TORCH_GOMP_PATH
+    "
+import os, glob
+try:
+  import torch
+  torch_pkg = os.path.dirname(torch.__file__)
+  site_root = os.path.dirname(torch_pkg)
+  torch_libs = os.path.join(site_root, 'torch.libs')
+  print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
+except:
+  print('')
+"
+    "failed to probe torch.libs for libgomp")
+
+  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
+    return()
+  endif()
+
+  # Create shim under the build tree
+  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
+  file(MAKE_DIRECTORY "${_shim}")
+
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
+
+  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
+endfunction()
+
 # Macro for converting a `gencode` version number to a cmake version number.
 macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
@@ -415,21 +453,20 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
 endmacro()

 #
-# Define a target named `GPU_MOD_NAME` for a single extension. The
+# Define a target named `MOD_NAME` for a single extension. The
 # arguments are:
 #
 # DESTINATION <dest>         - Module destination directory.
-# LANGUAGE <lang>            - The GPU language for this module, e.g CUDA, HIP,
-#                              etc.
+# LANGUAGE <lang>            - The language for this module, e.g. CUDA, HIP,
+#                              CXX, etc.
 # SOURCES <sources>          - List of source files relative to CMakeLists.txt
 #                              directory.
 #
 # Optional arguments:
 #
-# ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
-#                              format.
-#                              Refer `CMAKE_CUDA_ARCHITECTURES` documentation
-#                              and `CMAKE_HIP_ARCHITECTURES` for more info.
+# ARCHITECTURES <arches>     - A list of target architectures in cmake format.
+#                              For GPU, refer to CMAKE_CUDA_ARCHITECTURES and
+#                              CMAKE_HIP_ARCHITECTURES for more info.
 #                              ARCHITECTURES will use cmake's defaults if
 #                              not provided.
 # COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
@@ -440,63 +477,61 @@ endmacro()
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
-function (define_gpu_extension_target GPU_MOD_NAME)
+function (define_extension_target MOD_NAME)
  cmake_parse_arguments(PARSE_ARGV 1
-    GPU
+    ARG
    "WITH_SOABI"
    "DESTINATION;LANGUAGE;USE_SABI"
    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")

  # Add hipify preprocessing step when building with HIP/ROCm.
-  if (GPU_LANGUAGE STREQUAL "HIP")
-    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  if (ARG_LANGUAGE STREQUAL "HIP")
+    hipify_sources_target(ARG_SOURCES ${MOD_NAME} "${ARG_SOURCES}")
  endif()

-  if (GPU_WITH_SOABI)
-    set(GPU_WITH_SOABI WITH_SOABI)
+  if (ARG_WITH_SOABI)
+    set(SOABI_KEYWORD WITH_SOABI)
  else()
-    set(GPU_WITH_SOABI)
+    set(SOABI_KEYWORD "")
  endif()

-  if (GPU_USE_SABI)
-    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  if (ARG_USE_SABI)
+    Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
  else()
-    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+    Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
  endif()

-  if (GPU_LANGUAGE STREQUAL "HIP")
+  if (ARG_LANGUAGE STREQUAL "HIP")
    # Make this target dependent on the hipify preprocessor step.
-    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+    add_dependencies(${MOD_NAME} hipify${MOD_NAME})
    # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder
-    target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
-      ${GPU_INCLUDE_DIRECTORIES})
+    target_include_directories(${MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
+      ${ARG_INCLUDE_DIRECTORIES})
  else()
-    target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
-      ${GPU_INCLUDE_DIRECTORIES})
+    target_include_directories(${MOD_NAME} PRIVATE csrc
+      ${ARG_INCLUDE_DIRECTORIES})
  endif()

-  if (GPU_ARCHITECTURES)
-    set_target_properties(${GPU_MOD_NAME} PROPERTIES
-      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+  if (ARG_ARCHITECTURES)
+    set_target_properties(${MOD_NAME} PROPERTIES
+      ${ARG_LANGUAGE}_ARCHITECTURES "${ARG_ARCHITECTURES}")
  endif()

+  target_compile_options(${MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${ARG_LANGUAGE}>:${ARG_COMPILE_FLAGS}>)

-  target_compile_options(${GPU_MOD_NAME} PRIVATE
-    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
+  target_compile_definitions(${MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${MOD_NAME}")

-  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
-    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
-
-
-  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
+  target_link_libraries(${MOD_NAME} PRIVATE torch ${ARG_LIBRARIES})

  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
-  if (GPU_LANGUAGE STREQUAL "CUDA")
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
+  if (ARG_LANGUAGE STREQUAL "CUDA")
+    target_link_libraries(${MOD_NAME} PRIVATE torch CUDA::cudart CUDA::cuda_driver ${ARG_LIBRARIES})
  else()
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
+    target_link_libraries(${MOD_NAME} PRIVATE torch ${TORCH_LIBRARIES} ${ARG_LIBRARIES})
  endif()

-  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
+  install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${ARG_DESTINATION} COMPONENT ${MOD_NAME})
 endfunction()
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -46,6 +46,32 @@ __global__ void merge_attn_states_kernel(
  s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;

  const float max_lse = fmaxf(p_lse, s_lse);
+
+  /* In certain edge cases, MLA can produce p_lse = s_lse = -inf;
+     continuing the pipeline then yields NaN. Root cause: with chunked prefill
+     a batch may be split into two chunks; if a request in that batch has no
+     prefix hit, every LSE entry for that request’s position is -inf, and at
+     this moment we merge cross-attention at first. For now we simply emit
+     prefix_output (expected to be all zeros) and prefix_lse (-inf) to fix
+     this problem.
+  */
+  if (std::isinf(max_lse)) {
+    if (pack_offset < head_size) {
+      // Pack 128b load
+      pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
+          prefix_head_ptr)[pack_offset / pack_size];
+
+      // Pack 128b storage
+      reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
+          p_out_pack;
+    }
+    // We only need to write to output_lse once per head.
+    if (output_lse != nullptr && pack_idx == 0) {
+      output_lse[head_idx * num_tokens + token_idx] = max_lse;
+    }
+    return;
+  }
+
  p_lse = p_lse - max_lse;
  s_lse = s_lse - max_lse;
  const float p_se = expf(p_lse);
--- a/csrc/mamba/mamba_ssm/selective_scan.h
+++ b/csrc/mamba/mamba_ssm/selective_scan.h
@@ -24,6 +24,8 @@ struct SSMParamsBase {
    int64_t pad_slot_id;

    bool delta_softplus;
+    bool cache_enabled;
+    int block_size;

    index_t A_d_stride;
    index_t A_dstate_stride;
@@ -46,8 +48,9 @@ struct SSMParamsBase {
    index_t out_z_batch_stride;
    index_t out_z_d_stride;
    index_t ssm_states_batch_stride;
-    index_t ssm_states_dim_stride;  
+    index_t ssm_states_dim_stride;
    index_t ssm_states_dstate_stride;
+    index_t cache_indices_stride;

    // Common data pointers.
    void *__restrict__ A_ptr;
@@ -66,6 +69,9 @@ struct SSMParamsBase {
    void *__restrict__ cache_indices_ptr;
    void *__restrict__ has_initial_state_ptr;

+    void *__restrict__ block_idx_first_scheduled_token_ptr;  // (batch,) - first block to write
+    void *__restrict__ block_idx_last_scheduled_token_ptr;   // (batch,) - last block to write
+    void *__restrict__ initial_state_idx_ptr;  // (batch,) - index of the initial state to use
 };


--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -119,7 +119,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {

    const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
        : reinterpret_cast<int *>(params.cache_indices_ptr);
-    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; 
    // cache_index == params.pad_slot_id is defined as padding, so we exit early
    if (cache_index == params.pad_slot_id){
        return;
@@ -133,9 +133,18 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
    weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) + 
-    cache_index * params.ssm_states_batch_stride + 
-    dim_id * kNRows * params.ssm_states_dim_stride;
+
+    typename Ktraits::state_t *ssm_states;
+    if (params.cache_enabled) {
+        // APC mode: ssm_states points to the base, we'll use absolute cache slots later
+        ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
+            dim_id * kNRows * params.ssm_states_dim_stride;
+    } else {
+        // Non-APC mode: offset by cache_index as before
+        ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
+            cache_index * params.ssm_states_batch_stride +
+            dim_id * kNRows * params.ssm_states_dim_stride;
+    }
    
    float D_val[kNRows] = {0};
    if (params.D_ptr != nullptr) {
@@ -159,7 +168,22 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    // }

    constexpr int kChunkSize = kNThreads * kNItems;
-    const int n_chunks = (seqlen + 2048 - 1) / 2048;
+
+    // Use block_size for chunking when APC is enabled, otherwise use 2048 for backwards compatibility
+    const int iteration_chunk_size = params.cache_enabled ? params.block_size : 2048;
+    const int n_chunks = (seqlen + iteration_chunk_size - 1) / iteration_chunk_size;
+
+    const int* batch_cache_indices = cache_indices != nullptr ?
+                                     cache_indices + batch_id * params.cache_indices_stride : nullptr;
+    const int* block_idx_first_scheduled = params.block_idx_first_scheduled_token_ptr != nullptr ?
+                                           reinterpret_cast<const int*>(params.block_idx_first_scheduled_token_ptr) : nullptr;
+    const int* block_idx_last_scheduled = params.block_idx_last_scheduled_token_ptr != nullptr ?
+                                          reinterpret_cast<const int*>(params.block_idx_last_scheduled_token_ptr) : nullptr;
+    const int* initial_state_idx = params.initial_state_idx_ptr != nullptr ?
+                                   reinterpret_cast<const int*>(params.initial_state_idx_ptr) : nullptr;
+
+    const size_t load_cache_slot = params.cache_enabled && batch_cache_indices != nullptr ? batch_cache_indices[initial_state_idx[batch_id]] : cache_index;
+
    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];

@@ -219,7 +243,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
            if constexpr (kIsVariableC) {
                auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
                load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 ));
+                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1));
                if constexpr (!kIsVariableB) {
                    #pragma unroll
                    for (int r = 0; r < kNRows; ++r) {
@@ -242,7 +266,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                for (int i = 0; i < kNItems; ++i) {
                    thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                                 !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
-                    
                    if (seqlen % (kNItems * kNThreads) != 0) {  // So that the last state is correct
                        if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
                            thread_data[i] = make_float2(1.f, 0.f);
@@ -250,8 +273,24 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                    }
                }
                // Initialize running total
-
-                scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0);
+                scan_t running_prefix;
+                if (chunk > 0) {
+                    running_prefix = smem_running_prefix[state_idx + r * MAX_DSTATE];
+                } else {
+                    // Load initial state
+                    if (params.cache_enabled && has_initial_state && batch_cache_indices != nullptr) {
+                        size_t state_offset = load_cache_slot * params.ssm_states_batch_stride +
+                                             r * params.ssm_states_dim_stride +
+                                             state_idx * params.ssm_states_dstate_stride;
+                        running_prefix = make_float2(1.0, float(ssm_states[state_offset]));
+                    } else if (has_initial_state) {
+                        // Non-APC mode: load from current batch position
+                        running_prefix = make_float2(1.0, float(ssm_states[state_idx * params.ssm_states_dstate_stride]));
+                    } else {
+                        // No initial state
+                        running_prefix = make_float2(1.0, 0.0);
+                    }
+                }

                SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
                typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
@@ -260,8 +299,25 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                // There's a syncthreads in the scan op, so we don't need to sync here.
                // Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
                if (threadIdx.x == 0) {
-                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
-                    if (chunk == n_chunks - 1) {
+                    smem_running_prefix[state_idx + r * MAX_DSTATE] = prefix_op.running_prefix;
+
+                    // Store state at the end of each chunk when cache is enabled
+                    if (params.cache_enabled && batch_cache_indices != nullptr) {
+
+                        size_t cache_slot;
+                        if (chunk == n_chunks - 1) {
+                            cache_slot = batch_cache_indices[block_idx_last_scheduled[batch_id]];
+                        } else {
+                            cache_slot = batch_cache_indices[block_idx_first_scheduled[batch_id] + chunk];
+                        }
+
+                        size_t state_offset = cache_slot * params.ssm_states_batch_stride +
+                                             r * params.ssm_states_dim_stride +
+                                             state_idx * params.ssm_states_dstate_stride;
+
+                        ssm_states[state_offset] = typename Ktraits::state_t(prefix_op.running_prefix.y);
+                    } else if (!params.cache_enabled && chunk == n_chunks - 1) {
+                        // Non-APC mode: store only final state at current batch position
                        ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
                    }
                }
@@ -274,7 +330,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                }
            }
        }
-        
        input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
            + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
        __syncthreads();
@@ -346,7 +401,9 @@ template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {

    #ifndef USE_ROCM
-        if (params.seqlen <= 128) {           
+        if (params.cache_enabled && params.block_size == 1024) {
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+        } else if (params.seqlen <= 128) {
            selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
        } else if (params.seqlen <= 256) {
            selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
@@ -358,7 +415,9 @@ void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
        }
    #else
-        if (params.seqlen <= 256) {
+        if (params.cache_enabled && params.block_size == 1024) {
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+        } else if (params.seqlen <= 256) {
            selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
        } else if (params.seqlen <= 512) {
            selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
@@ -437,13 +496,17 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                        const std::optional<at::Tensor>& D,
                        const std::optional<at::Tensor>& delta_bias,
                        const torch::Tensor ssm_states,
-                        bool has_z, 
+                        bool has_z,
                        bool delta_softplus,
                        const std::optional<at::Tensor>& query_start_loc,
                        const std::optional<at::Tensor>& cache_indices,
                        const std::optional<at::Tensor>& has_initial_state,
                        bool varlen,
-                        int64_t pad_slot_id) {
+                        int64_t pad_slot_id,
+                        int64_t block_size,
+                        const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
+                        const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
+                        const std::optional<torch::Tensor> &initial_state_idx) {

    // Reset the parameters
    memset(&params, 0, sizeof(params));
@@ -477,6 +540,14 @@ void set_ssm_params_fwd(SSMParamsBase &params,
    params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
    params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;

+    // Set cache parameters - cache is enabled if we have direct cache writing params
+    params.cache_enabled = block_idx_first_scheduled_token.has_value();
+    params.block_size = static_cast<int>(block_size);
+
+    // Set direct cache writing pointers
+    params.block_idx_first_scheduled_token_ptr = block_idx_first_scheduled_token.has_value() ? block_idx_first_scheduled_token.value().data_ptr() : nullptr;
+    params.block_idx_last_scheduled_token_ptr = block_idx_last_scheduled_token.has_value() ? block_idx_last_scheduled_token.value().data_ptr() : nullptr;
+    params.initial_state_idx_ptr = initial_state_idx.has_value() ? initial_state_idx.value().data_ptr() : nullptr;

    // All stride are in elements, not bytes.
    params.A_d_stride = A.stride(0);
@@ -504,9 +575,11 @@ void set_ssm_params_fwd(SSMParamsBase &params,
        params.out_d_stride = out.stride(0);

        params.ssm_states_batch_stride = ssm_states.stride(0);
-        params.ssm_states_dim_stride = ssm_states.stride(1);  
+        params.ssm_states_dim_stride = ssm_states.stride(1);
        params.ssm_states_dstate_stride = ssm_states.stride(2);

+        params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0;
+
    }
    else{
        if (!is_variable_B) {
@@ -537,8 +610,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
        params.out_d_stride = out.stride(1);
        
        params.ssm_states_batch_stride = ssm_states.stride(0);
-        params.ssm_states_dim_stride = ssm_states.stride(1);  
+        params.ssm_states_dim_stride = ssm_states.stride(1);
        params.ssm_states_dstate_stride = ssm_states.stride(2);
+
+        params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0;
    }
 }

@@ -554,7 +629,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                  const torch::Tensor &ssm_states,
                  // used to identify padding entries if cache_indices provided
                  // in case of padding, the kernel will return early
-                  int64_t pad_slot_id) {
+                  int64_t pad_slot_id,
+                  int64_t block_size,
+                  const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
+                  const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
+                  const std::optional<torch::Tensor> &initial_state_idx) {
    auto input_type = u.scalar_type();
    auto weight_type = A.scalar_type();
    TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -646,7 +725,16 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
        auto cache_indices_ = cache_indices.value();
        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
        TORCH_CHECK(cache_indices_.is_cuda());
-        CHECK_SHAPE(cache_indices_, batch_size);
+
+        // cache_indices can be either 1D (batch_size,) for non-APC mode
+        // or 2D (batch_size, max_positions) for APC mode
+        const bool is_apc_mode = block_idx_first_scheduled_token.has_value();
+        if (is_apc_mode) {
+            TORCH_CHECK(cache_indices_.dim() == 2, "cache_indices must be 2D for APC mode");
+            TORCH_CHECK(cache_indices_.size(0) == batch_size, "cache_indices first dimension must match batch_size");
+        } else {
+            CHECK_SHAPE(cache_indices_, batch_size);
+        }
    }
   

@@ -686,7 +774,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                       cache_indices,
                       has_initial_state,
                       varlen,
-                       pad_slot_id
+                       pad_slot_id,
+                       block_size,
+                       block_idx_first_scheduled_token,
+                       block_idx_last_scheduled_token,
+                       initial_state_idx
                       );

    
--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@@ -87,30 +87,23 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
  const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
  const int64_t g_eff_2 = (group_size != -1) ? group_size : I;

-  // Per-expert outputs filled in parallel
-  std::vector<torch::Tensor> y_list(E);
-  y_list.resize(E);
+  auto X_all = x_c.index_select(/*dim=*/0, expert_tokens);
+  if (apply_router_weight_on_input) {
+    X_all = X_all.mul(expert_gates.unsqueeze(1));
+  }
+  auto Y_all = at::empty({offsets[E], H}, x_c.options());

  at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
+    c10::InferenceMode guard;
    for (int64_t e = e_begin; e < e_end; ++e) {
      const int64_t te = counts[e];
      if (te == 0) {
-        y_list[e] = at::empty({0, H}, x_c.options());
        continue;
      }

      const int64_t start = offsets[e];

-      auto sel_tokens =
-          expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
-      auto gates_e =
-          expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
-
-      auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);
-
-      if (apply_router_weight_on_input) {
-        x_e = x_e.mul(gates_e.unsqueeze(1));
-      }
+      auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);

      auto w13_e = w13_packed.select(/*dim=*/0, e);
      auto w2_e = w2_packed.select(/*dim=*/0, e);
@@ -137,17 +130,15 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
      // W2
      auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);

-      if (!apply_router_weight_on_input) {
-        y = y.mul(gates_e.unsqueeze(1));
-      }
-
      // Store per-expert result
-      y_list[e] = y;
+      Y_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te).copy_(y);
    }
  });

-  // Concatenate all expert outputs to match expert_tokens order
-  auto Y_all = at::cat(y_list, /*dim=*/0);
+  if (!apply_router_weight_on_input) {
+    Y_all = Y_all.mul(expert_gates.unsqueeze(1));
+  }
+
  auto out = at::zeros({T, H}, x.options());
  out =
      at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -427,11 +427,29 @@ __device__ inline bool is_finite(const T val) {
 #endif
 }

+// Scoring function enums
+enum ScoringFunc {
+  SCORING_NONE = 0,    // no activation function
+  SCORING_SIGMOID = 1  // apply sigmoid
+};
+
+// Efficient sigmoid approximation from TensorRT-LLM
+__device__ inline float sigmoid_accurate(float x) {
+  return 0.5f * tanhf(0.5f * x) + 0.5f;
+}
+
 template <typename T>
-__device__ void topk_with_k2(T* output, T const* input,
+__device__ inline T apply_sigmoid(T val) {
+  float f = cuda_cast<float, T>(val);
+  return cuda_cast<T, float>(sigmoid_accurate(f));
+}
+
+template <typename T>
+__device__ void topk_with_k2(T* output, T const* input, T const* bias,
                             cg::thread_block_tile<32> const& tile,
                             int32_t const lane_id,
-                             int const num_experts_per_group) {
+                             int const num_experts_per_group,
+                             int const scoring_func) {
  // Get the top2 per thread
  T largest = neg_inf<T>();
  T second_largest = neg_inf<T>();
@@ -439,6 +457,12 @@ __device__ void topk_with_k2(T* output, T const* input,
  if (num_experts_per_group > WARP_SIZE) {
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
      T value = input[i];
+      // Apply scoring function if needed
+      if (scoring_func == SCORING_SIGMOID) {
+        value = apply_sigmoid(value);
+      }
+      value = value + bias[i];
+
      if (value > largest) {
        second_largest = largest;
        largest = value;
@@ -448,7 +472,13 @@ __device__ void topk_with_k2(T* output, T const* input,
    }
  } else {
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      largest = input[i];
+      T value = input[i];
+      // Apply scoring function if needed
+      if (scoring_func == SCORING_SIGMOID) {
+        value = apply_sigmoid(value);
+      }
+      value = value + bias[i];
+      largest = value;
    }
  }

@@ -472,17 +502,21 @@ __device__ void topk_with_k2(T* output, T const* input,
 }

 template <typename T>
-__global__ void topk_with_k2_kernel(T* output, T* input,
+__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
                                    int64_t const num_tokens,
                                    int64_t const num_cases,
                                    int64_t const n_group,
-                                    int64_t const num_experts_per_group) {
+                                    int64_t const num_experts_per_group,
+                                    int const scoring_func) {
  int32_t warp_id = threadIdx.x / WARP_SIZE;
  int32_t lane_id = threadIdx.x % WARP_SIZE;

  int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
  if (case_id < num_cases) {
    input += case_id * num_experts_per_group;
+    // bias is per expert group, offset to current group
+    int32_t group_id = case_id % n_group;
+    T const* group_bias = bias + group_id * num_experts_per_group;
    output += case_id;

    cg::thread_block block = cg::this_thread_block();
@@ -491,7 +525,8 @@ __global__ void topk_with_k2_kernel(T* output, T* input,
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    asm volatile("griddepcontrol.wait;");
 #endif
-    topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
+    topk_with_k2(output, input, group_bias, tile, lane_id,
+                 num_experts_per_group, scoring_func);
  }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
@@ -500,16 +535,15 @@ __global__ void topk_with_k2_kernel(T* output, T* input,

 template <typename T, typename IdxT>
 __global__ void group_idx_and_topk_idx_kernel(
-    T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices,
-    T* scores_with_bias, int64_t const num_tokens, int64_t const n_group,
+    T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
+    T const* bias, int64_t const num_tokens, int64_t const n_group,
    int64_t const topk_group, int64_t const topk, int64_t const num_experts,
    int64_t const num_experts_per_group, bool renormalize,
-    double routed_scaling_factor) {
+    double routed_scaling_factor, int scoring_func) {
  int32_t warp_id = threadIdx.x / WARP_SIZE;
  int32_t lane_id = threadIdx.x % WARP_SIZE;
  int32_t case_id =
      blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;  // one per token
-  scores_with_bias += case_id * num_experts;
  scores += case_id * num_experts;
  group_scores += case_id * n_group;
  topk_values += case_id * topk;
@@ -577,10 +611,16 @@ __global__ void group_idx_and_topk_idx_kernel(
        int32_t offset = i_group * num_experts_per_group;
        for (int32_t i = lane_id; i < align_num_experts_per_group;
             i += WARP_SIZE) {
-          T candidates = (i < num_experts_per_group) &&
-                                 is_finite(scores_with_bias[offset + i])
-                             ? scores_with_bias[offset + i]
-                             : neg_inf<T>();
+          T candidates = neg_inf<T>();
+          if (i < num_experts_per_group) {
+            // Apply scoring function (if any) and add bias
+            T input = scores[offset + i];
+            if (is_finite(input)) {
+              T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
+                                                          : input;
+              candidates = score + bias[offset + i];
+            }
+          }
          queue.add(candidates, offset + i);
        }
        if (group_scores[i_group] == topk_group_value) {
@@ -602,11 +642,12 @@ __global__ void group_idx_and_topk_idx_kernel(
    for (int i = lane_id;
         i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
         i += WARP_SIZE) {
-      T value =
-          i < topk
-              ? scores[s_topk_idx[i]]
-              : cuda_cast<T, float>(0.0f);  // Load the valid value of expert
+      T value = cuda_cast<T, float>(0.0f);
      if (i < topk) {
+        // Load the score value (without bias) for normalization
+        T input = scores[s_topk_idx[i]];
+        value =
+            (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
        s_topk_value[i] = value;
      }
      topk_sum +=
@@ -627,12 +668,12 @@ __global__ void group_idx_and_topk_idx_kernel(
          value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
        }
        topk_indices[i] = s_topk_idx[i];
-        topk_values[i] = cuda_cast<T, float>(value);
+        topk_values[i] = value;
      }
    } else {
      for (int i = lane_id; i < topk; i += WARP_SIZE) {
        topk_indices[i] = i;
-        topk_values[i] = cuda_cast<T, float>(1.0f / topk);
+        topk_values[i] = 1.0f / topk;
      }
    }
    // Note: when if_proceed_next_topk==false, choose the first 8 experts as the
@@ -644,12 +685,12 @@ __global__ void group_idx_and_topk_idx_kernel(
 }

 template <typename T, typename IdxT>
-void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
-                   IdxT* topk_indices, T* scores_with_bias,
-                   int64_t const num_tokens, int64_t const num_experts,
-                   int64_t const n_group, int64_t const topk_group,
-                   int64_t const topk, bool const renormalize,
-                   double const routed_scaling_factor, bool enable_pdl = false,
+void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
+                   IdxT* topk_indices, T const* bias, int64_t const num_tokens,
+                   int64_t const num_experts, int64_t const n_group,
+                   int64_t const topk_group, int64_t const topk,
+                   bool const renormalize, double const routed_scaling_factor,
+                   int const scoring_func, bool enable_pdl = false,
                   cudaStream_t const stream = 0) {
  int64_t num_cases = num_tokens * n_group;
  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -664,8 +705,9 @@ void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.numAttrs = 1;
  config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias,
-                     num_tokens, num_cases, n_group, num_experts / n_group);
+  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
+                     num_tokens, num_cases, n_group, num_experts / n_group,
+                     scoring_func);

  int64_t topk_with_k_group_num_blocks =
      (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -682,19 +724,18 @@ void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
  config.numAttrs = 1;
  config.attrs = attrs;
  cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
-                     topk_values, topk_indices, scores_with_bias, num_tokens,
-                     n_group, topk_group, topk, num_experts,
-                     num_experts / n_group, renormalize, routed_scaling_factor);
+                     topk_values, topk_indices, bias, num_tokens, n_group,
+                     topk_group, topk, num_experts, num_experts / n_group,
+                     renormalize, routed_scaling_factor, scoring_func);
 }

 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
  template void invokeNoAuxTc<T, IdxT>(                                     \
-      T * scores, T * group_scores, T * topk_values, IdxT * topk_indices,   \
-      T * scores_with_bias, int64_t const num_tokens,                       \
-      int64_t const num_experts, int64_t const n_group,                     \
-      int64_t const topk_group, int64_t const topk, bool const renormalize, \
-      double const routed_scaling_factor, bool enable_pdl,                  \
-      cudaStream_t const stream);
+      T * scores, T * group_scores, float* topk_values, IdxT* topk_indices, \
+      T const* bias, int64_t const num_tokens, int64_t const num_experts,   \
+      int64_t const n_group, int64_t const topk_group, int64_t const topk,  \
+      bool const renormalize, double const routed_scaling_factor,           \
+      int const scoring_func, bool enable_pdl, cudaStream_t const stream);

 INSTANTIATE_NOAUX_TC(float, int32_t);
 INSTANTIATE_NOAUX_TC(half, int32_t);
@@ -703,28 +744,32 @@ INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
 }  // namespace vllm

 std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
-    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
-    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
-    double routed_scaling_factor) {
-  auto data_type = scores_with_bias.scalar_type();
-  auto input_size = scores_with_bias.sizes();
+    torch::Tensor const& scores, int64_t n_group, int64_t topk_group,
+    int64_t topk, bool renormalize, double routed_scaling_factor,
+    torch::Tensor const& bias, int64_t scoring_func = 0) {
+  auto data_type = scores.scalar_type();
+  auto input_size = scores.sizes();
  int64_t num_tokens = input_size[0];
  int64_t num_experts = input_size[1];
-  TORCH_CHECK(input_size.size() == 2, "scores_with_bias must be a 2D Tensor");
+  TORCH_CHECK(input_size.size() == 2, "scores must be a 2D Tensor");
  TORCH_CHECK(num_experts % n_group == 0,
              "num_experts should be divisible by n_group");
  TORCH_CHECK(n_group <= 32,
              "n_group should be smaller than or equal to 32 for now");
  TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
+  TORCH_CHECK(scoring_func == vllm::moe::SCORING_NONE ||
+                  scoring_func == vllm::moe::SCORING_SIGMOID,
+              "scoring_func must be SCORING_NONE (0) or SCORING_SIGMOID (1)");

  torch::Tensor group_scores = torch::empty(
      {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
+  // Always output float32 for topk_values (eliminates Python-side conversion)
  torch::Tensor topk_values = torch::empty(
-      {num_tokens, topk}, torch::dtype(data_type).device(torch::kCUDA));
+      {num_tokens, topk}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
  torch::Tensor topk_indices = torch::empty(
      {num_tokens, topk}, torch::dtype(torch::kInt32).device(torch::kCUDA));

-  auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device());
+  auto stream = c10::cuda::getCurrentCUDAStream(scores.get_device());

  switch (data_type) {
    case torch::kFloat16:
@@ -732,11 +777,11 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
      vllm::moe::invokeNoAuxTc<half, int32_t>(
          reinterpret_cast<half*>(scores.mutable_data_ptr()),
          reinterpret_cast<half*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<half*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<half*>(scores_with_bias.data_ptr()), num_tokens,
+          reinterpret_cast<half const*>(bias.data_ptr()), num_tokens,
          num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
      break;
    case torch::kFloat32:
      // Handle Float32
@@ -745,20 +790,20 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
          reinterpret_cast<float*>(group_scores.mutable_data_ptr()),
          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<float*>(scores_with_bias.data_ptr()), num_tokens,
+          reinterpret_cast<float const*>(bias.data_ptr()), num_tokens,
          num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
      break;
    case torch::kBFloat16:
      // Handle BFloat16
      vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
          reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
          reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()),
+          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()),
-          num_tokens, num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
+          reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), num_tokens,
+          num_experts, n_group, topk_group, topk, renormalize,
+          routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
      break;
    default:
      // Handle other data types
--- a/csrc/moe/moe_lora_align_sum_kernels.cu
+++ b/csrc/moe/moe_lora_align_sum_kernels.cu
@@ -28,11 +28,16 @@ __global__ void moe_lora_align_sum_kernel(
    int64_t block_size, int num_experts, int max_loras, size_t numel,
    int max_num_tokens_padded, int max_num_m_blocks,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int topk_num, int32_t* total_tokens_post_pad) {
+    int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
+    int32_t* lora_ids) {
  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
  const size_t start_idx = threadIdx.x * tokens_per_thread;

-  int lora_id = blockIdx.x;
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
+  }
  extern __shared__ int32_t shared_mem[];
  int32_t* cumsum = shared_mem;
  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
@@ -121,14 +126,13 @@ __global__ void moe_lora_align_sum_kernel(
  }
 }

-void moe_lora_align_block_size(torch::Tensor topk_ids,
-                               torch::Tensor token_lora_mapping,
-                               int64_t num_experts, int64_t block_size,
-                               int64_t max_loras, int64_t max_num_tokens_padded,
-                               int64_t max_num_m_blocks,
-                               torch::Tensor sorted_token_ids,
-                               torch::Tensor expert_ids,
-                               torch::Tensor num_tokens_post_pad) {
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids) {
  const int topk_num = topk_ids.size(1);

  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
@@ -164,6 +168,7 @@ void moe_lora_align_block_size(torch::Tensor topk_ids,
            max_loras, topk_ids.numel(), max_num_tokens_padded,
            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
            expert_ids.data_ptr<int32_t>(), topk_num,
-            num_tokens_post_pad.data_ptr<int32_t>());
+            num_tokens_post_pad.data_ptr<int32_t>(),
+            adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
      });
 }
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -20,14 +20,13 @@ void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                  torch::Tensor expert_ids,
                                  torch::Tensor num_tokens_post_pad);

-void moe_lora_align_block_size(torch::Tensor topk_ids,
-                               torch::Tensor token_lora_mapping,
-                               int64_t num_experts, int64_t block_size,
-                               int64_t max_loras, int64_t max_num_tokens_padded,
-                               int64_t max_num_m_blocks,
-                               torch::Tensor sorted_token_ids,
-                               torch::Tensor expert_ids,
-                               torch::Tensor num_tokens_post_pad);
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
@@ -40,9 +39,9 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             int64_t BLOCK_SIZE_K, int64_t bit);

 std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
-    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
-    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
-    double routed_scaling_factor);
+    torch::Tensor const& scores, int64_t n_group, int64_t topk_group,
+    int64_t topk, bool renormalize, double routed_scaling_factor,
+    torch::Tensor const& bias, int64_t scoring_func);
 #endif

 bool moe_permute_unpermute_supported();
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -44,7 +44,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "                     int max_num_m_blocks, "
      "                     Tensor !sorted_token_ids,"
      "                     Tensor !experts_ids,"
-      "                     Tensor !num_tokens_post_pad) -> () ");
+      "                     Tensor !num_tokens_post_pad,"
+      "                     Tensor !adapter_enabled,"
+      "                     Tensor !lora_ids) -> () ");
  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);

 #ifndef USE_ROCM
@@ -105,9 +107,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {

  // Apply grouped topk routing to select experts.
  m.def(
-      "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int "
+      "grouped_topk(Tensor scores, int n_group, int "
      "topk_group, int topk, bool renormalize, float "
-      "routed_scaling_factor) -> (Tensor, Tensor)");
+      "routed_scaling_factor, Tensor bias, int scoring_func) -> (Tensor, "
+      "Tensor)");
  m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 #endif
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -321,17 +321,19 @@ void dynamic_per_token_scaled_fp8_quant(
    torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
    std::optional<torch::Tensor> const& scale_ub);

-void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
-                        const torch::Tensor& A, const torch::Tensor& B,
-                        const torch::Tensor& C,
-                        const std::optional<torch::Tensor>& D_,
-                        const std::optional<torch::Tensor>& z_,
-                        const std::optional<torch::Tensor>& delta_bias_,
-                        bool delta_softplus,
-                        const std::optional<torch::Tensor>& query_start_loc,
-                        const std::optional<torch::Tensor>& cache_indices,
-                        const std::optional<torch::Tensor>& has_initial_state,
-                        const torch::Tensor& ssm_states, int64_t pad_slot_id);
+void selective_scan_fwd(
+    const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
+    const torch::Tensor& B, const torch::Tensor& C,
+    const std::optional<torch::Tensor>& D_,
+    const std::optional<torch::Tensor>& z_,
+    const std::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
+    const std::optional<torch::Tensor>& query_start_loc,
+    const std::optional<torch::Tensor>& cache_indices,
+    const std::optional<torch::Tensor>& has_initial_state,
+    const torch::Tensor& ssm_states, int64_t pad_slot_id, int64_t block_size,
+    const std::optional<torch::Tensor>& block_idx_first_scheduled_token,
+    const std::optional<torch::Tensor>& block_idx_last_scheduled_token,
+    const std::optional<torch::Tensor>& initial_state_idx);

 torch::Tensor dynamic_4bit_int_moe_cpu(
    torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -578,11 +578,13 @@ void persistent_masked_m_silu_mul_quant(

  // This kernel currently only supports H % 128 == 0 and assumes a
  // fixed GROUP_SIZE of 128.
+  static constexpr int GROUP_SIZE = 128;
+
  TORCH_CHECK(input.dtype() == torch::kBFloat16);
  TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
              y_q.dtype() == torch::kFloat8_e4m3fnuz);
  TORCH_CHECK(y_s.dtype() == torch::kFloat32);
-  TORCH_CHECK(input.size(-1) % 256 == 0);
+  TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0);

  using Idx_t = int64_t;

@@ -601,8 +603,6 @@ void persistent_masked_m_silu_mul_quant(

  Idx_t stride_counts_e = tokens_per_expert.stride(0);

-  static constexpr int GROUP_SIZE = 128;
-
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  #define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES)                 \
@@ -628,21 +628,26 @@ void persistent_masked_m_silu_mul_quant(

  static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;

+  int const NUM_GROUPS = H / GROUP_SIZE;
  if (!use_ue8m0) {
-    if (H >= 4096) {
+    if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
+      /* 8 warps config */
      static constexpr int NUM_STAGES = 4;
      static constexpr int THREAD_COUNT = 256;
      KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
    } else {
+      /* 1 warp config */
      static constexpr int THREAD_COUNT = 32;
      KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
    }
  } else {
-    if (H >= 4096) {
+    if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
+      /* 8 warps config */
      static constexpr int NUM_STAGES = 4;
      static constexpr int THREAD_COUNT = 256;
      KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
    } else {
+      /* 1 warp config */
      static constexpr int THREAD_COUNT = 32;
      KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
    }
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -31,6 +31,13 @@

 namespace vllm {

+template <typename Int>
+__host__ __device__ inline Int round_up(Int x, Int y) {
+  static_assert(std::is_integral_v<Int>,
+                "round_up argument must be integral type");
+  return (x + y - 1) / y * y;
+}
+
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
@@ -42,10 +49,21 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                "Vec size is not matched.");

+  int sf_m = round_up<int>(numRows, 128);
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
+  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
+    // Each thread writes 4 uint32_t elements.
+    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
+         col += blockDim.x * 4) {
+      SFout[row * sf_n_int + col] = 0x00;
+    }
+  }
+
  // Get the global scaling factor, which will be applied to the SF.
  // Note SFScale is the same as next GEMM's alpha, which is
  // (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];

  // Input tensor row/col loops.
  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
@@ -64,7 +82,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
              rowIdx, colIdx, numCols, SFout);

      out_pos =
-          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
    }
  }
 }
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
@@ -1,6 +1,5 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm100_fp8_dispatch.cuh"
-#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"

 namespace vllm {

@@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
  if (bias) {
    TORCH_CHECK(bias->dtype() == out.dtype(),
                "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
+    return cutlass_scaled_mm_sm100_fp8_epilogue<true>(out, a, b, a_scales,
+                                                      b_scales, *bias);
  } else {
-    return cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_scaled_mm_sm100_fp8_epilogue<false>(out, a, b, a_scales,
+                                                       b_scales);
  }
 }

--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -2,6 +2,7 @@

 #include "scaled_mm.cuh"
 #include "cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"

 /**
 * This file defines Gemm kernel configurations for SM100 (fp8) based on the
@@ -12,8 +13,88 @@ namespace vllm {

 using c3x::cutlass_gemm_caller;

-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, bool swap_ab_ = false>
+struct cutlass_3x_gemm_sm100_fp8 {
+  using ElementAB = ElementAB_;
+  using ElementC = ElementD_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Compile-time swap_ab flag
+  static constexpr bool swap_ab = swap_ab_;
+
+  // -----------------------------------------------------------
+  // Layout definitions
+  // -----------------------------------------------------------
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_T = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_T = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+
+  // -----------------------------------------------------------
+  // Collective epilogue (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose, LayoutC>, AlignmentCD,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+          AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // -----------------------------------------------------------
+  // Collective mainloop (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutB_T, AlignmentAB,             // Swapped B (as A)
+          ElementAB, LayoutA_T, AlignmentAB,  // Swapped A (as B)
+          ElementAcc, TileShape, ClusterShape, Stages,
+          KernelSchedule>::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>;
+
+  // -----------------------------------------------------------
+  // Kernel definition
+  // -----------------------------------------------------------
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
 struct sm100_fp8_config_default {
  // M in (256, inf)
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
@@ -22,12 +103,16 @@ struct sm100_fp8_config_default {
  using TileShape = Shape<_256, _128, _128>;
  using ClusterShape = Shape<_2, _2, _1>;
  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };

-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename InType, typename OutType, bool EnableBias>
 struct sm100_fp8_config_M256 {
  // M in (64, 256]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
@@ -36,44 +121,127 @@ struct sm100_fp8_config_M256 {
  using TileShape = Shape<_128, _128, _128>;
  using ClusterShape = Shape<_2, _1, _1>;
  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };

-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M64_swap_ab {
+  // This config is for M in (16, 64] and K >= 4096
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _64, _256>;
+  using ClusterShape = Shape<_4, _1, _1>;
+
+  // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap
+  // AB
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                                TileShape, ClusterShape, KernelSchedule,
+                                EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                                ClusterShape, KernelSchedule, EpilogueSchedule,
+                                true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
 struct sm100_fp8_config_M64 {
-  // M in (16, 64]
+  // This config is for M = 64 and K < 4096 (do not enable swap AB in such case)
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
  using TileShape = Shape<_64, _64, _128>;
  using ClusterShape = Shape<_1, _1, _1>;
+
  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+      conditional_t<EnableBias,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogueBias, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>,
+                    cutlass_3x_gemm_sm100_fp8<
+                        InType, OutType, c3x::ScaledEpilogue, TileShape,
+                        ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };

-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M16 {
+template <typename InType, typename OutType, bool EnableBias>
+struct sm100_fp8_config_M16_swap_ab {
  // M in [1, 16]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _4, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+  using TileShape = Shape<_128, _32, _128>;
+  using ClusterShape = Shape<_4, _1, _1>;
+
+  // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap
+  // AB
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                                TileShape, ClusterShape, KernelSchedule,
+                                EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm100_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                                ClusterShape, KernelSchedule, EpilogueSchedule,
+                                true>>;
 };

-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue,
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   EpilogueArgs&&... epilogue_params) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape =
+      swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(
+      StrideC{},
+      swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args =
+      swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr,
+                                                       a_stride}
+              : typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename InType, typename OutType, bool EnableBias,
          typename... EpilogueArgs>
 inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
                                            torch::Tensor const& a,
                                            torch::Tensor const& b,
+                                            torch::Tensor const& a_scales,
+                                            torch::Tensor const& b_scales,
                                            EpilogueArgs&&... args) {
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
@@ -81,55 +249,69 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,

  using Cutlass3xGemmDefault =
      typename sm100_fp8_config_default<InType, OutType,
-                                        Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM16 =
-      typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm;
+                                        EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16SwapAB =
+      typename sm100_fp8_config_M16_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64SwapAB =
+      typename sm100_fp8_config_M64_swap_ab<InType, OutType,
+                                            EnableBias>::Cutlass3xGemm;
  using Cutlass3xGemmM64 =
-      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+      typename sm100_fp8_config_M64<InType, OutType, EnableBias>::Cutlass3xGemm;
+
  using Cutlass3xGemmM256 =
-      typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
+      typename sm100_fp8_config_M256<InType, OutType,
+                                     EnableBias>::Cutlass3xGemm;

  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
+  uint32_t const k = a.size(1);

-  if (mp2 <= 16) {
+  if (m <= 16) {
    // m in [1, 16]
-    return cutlass_gemm_caller<Cutlass3xGemmM16>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 64) {
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM16SwapAB>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 64) {
    // m in (16, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 256) {
+    if (m == 64 && k < 4096) {
+      // do not enable swap AB
+      return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM64>(
+          out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM64SwapAB>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+
+  } else if (m <= 256) {
    // m in (64, 256]
-    return cutlass_gemm_caller<Cutlass3xGemmM256>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmM256>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
  } else {
    // m in (256, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
+    return cutlass_gemm_caller_sm100_fp8<Cutlass3xGemmDefault>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
  }
 }

-template <template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
+template <bool EnableBias, typename... EpilogueArgs>
 void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
                                          torch::Tensor const& a,
                                          torch::Tensor const& b,
+                                          torch::Tensor const& a_scales,
+                                          torch::Tensor const& b_scales,
                                          EpilogueArgs&&... epilogue_args) {
  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);

  if (out.dtype() == torch::kBFloat16) {
    return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
-                                           cutlass::bfloat16_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+                                           cutlass::bfloat16_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
  } else {
    TORCH_CHECK(out.dtype() == torch::kFloat16);
    return cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t,
-                                           cutlass::half_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+                                           cutlass::half_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
  }
 }

--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml`