[Core] Rename PassConfig flags as per RFC #27995 (#29646 )

Signed-off-by: arpitkh101 <arpit5khandelwal@gmail.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> (cherry picked from commit d7284a2604)
[BugFix] Fix assert in build_for_cudagraph_capture (#29893 )
2025-12-02 20:38:43 -08:00 · 2025-12-02 16:59:56 -08:00 · 2025-12-02 15:27:01 -08:00 · 2025-12-02 15:08:47 -08:00 · 2025-12-02 15:08:34 -08:00 · 2025-12-02 15:08:24 -08:00
1291 changed files with 78888 additions and 30158 deletions
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma

 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
+
+<details>
+<summary> An Example of default parameters field </summary>
+
+```json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "server_environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+    },
+    "server_parameters": {
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "block_size": 128,
+      "disable_log_stats": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 128,
+      "random-output-len": 128,
+      "num_prompts": 200,
+      "ignore-eos": ""
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama3B_tp2_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 2,
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+      }
+    },
+    {
+      "test_name": "serving_qwen3_tp4_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-14B",
+        "tensor_parallel_size": 4,
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-14B",
+      }
+    },
+  ]
+}
+```
+
+</details>
+
 ### Visualizing the results

 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -110,7 +110,8 @@ json2envs() {
 wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
+  local timeout_val="1200"
+  timeout "$timeout_val" bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done' && return 0 || return 1
@@ -316,12 +317,44 @@ run_throughput_tests() {
 run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
+  jq -c '
+    if type == "array" then
+      # Plain format: test cases array
+      .[]
+    elif (type == "object" and has("tests")) then
+      # merge the default parameters into each test cases
+      . as $root
+      | ($root.defaults // {}) as $d
+      | ($root.tests // [])[]
+      # default qps / max_concurrency from defaults if missing
+      | .qps_list = (.qps_list // $d.qps_list)
+      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+      # merge envs / params: test overrides defaults
+      | .server_environment_variables =
+          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+      | .server_parameters =
+          (($d.server_parameters // {}) + (.server_parameters // {}))
+      | .client_parameters =
+          (($d.client_parameters // {}) + (.client_parameters // {}))
+    else
+      error("Unsupported serving test file format: must be array or object with .tests")
+    end
+  ' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -335,20 +368,25 @@ run_serving_tests() {
      continue
    fi

-    # get client and server arguments
+    # get client and server arguments (after merged the default parameters)
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
+
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
+
+    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
+
+    # max_concurrency_list (fallback to num_prompts if missing)
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-        max_concurrency_list="[$num_prompts]"
+      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+      max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -1,610 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    }
-]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -1,276 +1,246 @@
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 32
-        }
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
    },
-    {
-        "test_name": "serving_llama8B_tp2_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 32
-        }
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
    },
-    {
-        "test_name": "serving_llama8B_tp1_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp1_random_128_2048",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 2048,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_random_128_2048",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 2048,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp1_random_2048_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 2048,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp2_random_2048_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 2048,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 32
-        }
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
    }
-]
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama3B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_granite2B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen1.7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen4B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen8B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_glm9B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_gemma7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "google/gemma-7b",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "google/gemma-7b",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -8,7 +8,7 @@ steps:
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@@ -30,19 +30,6 @@ steps:
      DOCKER_BUILDKIT: "1"

  # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.8"
-    depends_on: ~
-    id: build-wheel-cuda-12-8
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
@@ -109,7 +96,6 @@ steps:
  - label: "Annotate release workflow"
    depends_on:
      - create-multi-arch-manifest
-      - build-wheel-cuda-12-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
@@ -132,7 +118,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -23,8 +23,8 @@ To download the wheel (by version):
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

 To download and upload the image:
@@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
-EOF 
+EOF 
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+if not sys.version_info >= (3, 12):
+    raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
+<html>
+  <meta name="pypi:repository-version" content="1.0">
+  <body>
+{items}
+  </body>
+</html>
+"""
+
+
+@dataclass
+class WheelFileInfo:
+    package_name: str
+    version: str
+    build_tag: str | None
+    python_tag: str
+    abi_tag: str
+    platform_tag: str
+    variant: str | None
+    filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+    """
+    Parse wheel file name to extract metadata.
+
+    The format of wheel names:
+        {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
+    Example:
+        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+    """
+    wheel_file_re = re.compile(
+        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
+    )
+    match = wheel_file_re.match(file)
+    if not match:
+        raise ValueError(f"Invalid wheel file name: {file}")
+
+    package_name = match.group("package_name")
+    version = match.group("version")
+    build_tag = match.group("build_tag")
+    python_tag = match.group("python_tag")
+    abi_tag = match.group("abi_tag")
+    platform_tag = match.group("platform_tag")
+
+    # extract variant from version
+    variant = None
+    if "dev" in version:
+        ver_after_dev = version.split("dev")[-1]
+        if "." in ver_after_dev:
+            variant = ver_after_dev.split(".")[-1]
+            version = version.removesuffix("." + variant)
+    else:
+        if "+" in version:
+            version, variant = version.split("+")
+
+    return WheelFileInfo(
+        package_name=package_name,
+        version=version,
+        build_tag=build_tag,
+        python_tag=python_tag,
+        abi_tag=abi_tag,
+        platform_tag=platform_tag,
+        variant=variant,
+        filename=file,
+    )
+
+
+def generate_project_list(subdir_names: list[str]) -> str:
+    """
+    Generate project list HTML content linking to each project & variant sub-directory.
+    """
+    href_tags = []
+    for name in sorted(subdir_names):
+        name = name.strip("/").strip(".")
+        href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
+    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+
+
+def generate_package_index_and_metadata(
+    wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
+) -> tuple[str, str]:
+    """
+    Generate package index HTML content for a specific package, linking to actual wheel files.
+    """
+    href_tags = []
+    metadata = []
+    for file in sorted(wheel_files, key=lambda x: x.filename):
+        relative_path = (
+            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+        )
+        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
+        # NOTE: this is AWS S3 specific behavior!
+        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
+        href_tags.append(f'    <a href="{file_path_quoted}">{file.filename}</a><br/>')
+        file_meta = asdict(file)
+        file_meta["path"] = file_path_quoted
+        metadata.append(file_meta)
+    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+    metadata_str = json.dumps(metadata, indent=2)
+    return index_str, metadata_str
+
+
+def generate_index_and_metadata(
+    whl_files: list[str],
+    wheel_base_dir: Path,
+    index_base_dir: Path,
+    default_variant: str | None = None,
+    alias_to_default: str | None = None,
+):
+    """
+    Generate index for all wheel files.
+
+    Args:
+        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+        wheel_base_dir (Path): Base directory for wheel files.
+        index_base_dir (Path): Base directory to store index files.
+        default_variant (str | None): The default variant name, if any.
+        alias_to_default (str | None): Alias variant name for the default variant, if any.
+
+    First, parse all wheel files to extract metadata.
+    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+    The index for the default variant (if any) is generated in the root index directory.
+
+    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+    is purely a copy of the corresponding variant index, with only the links adjusted.
+    Otherwise, all wheels without variant suffixes are treated as the default variant.
+
+    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
+    as the default variant index, but the links are adjusted accordingly.
+
+    Index directory structure:
+        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
+            vllm/
+                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+                metadata.json # machine-readable metadata for all wheels in this package
+            cpu/ # cpu variant sub-directory
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            cu129/ # cu129 is actually the alias to default variant
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            cu130/ # cu130 variant sub-directory
+                index.html
+                vllm/
+                    index.html
+                    metadata.json
+            ...
+
+    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+        [
+            {
+                "package_name": "vllm",
+                "version": "0.10.2rc2",
+                "build_tag": null,
+                "python_tag": "cp38",
+                "abi_tag": "abi3",
+                "platform_tag": "manylinux2014_aarch64",
+                "variant": "cu129",
+                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
+            },
+            ...
+        ]
+    """
+
+    parsed_files = [parse_from_filename(f) for f in whl_files]
+
+    if not parsed_files:
+        print("No wheel files found, skipping index generation.")
+        return
+
+    # Group by variant
+    variant_to_files: dict[str, list[WheelFileInfo]] = {}
+    for file in parsed_files:
+        variant = file.variant or "default"
+        if variant not in variant_to_files:
+            variant_to_files[variant] = []
+        variant_to_files[variant].append(file)
+
+    print(f"Found variants: {list(variant_to_files.keys())}")
+
+    # sanity check for default variant
+    if default_variant:
+        if "default" in variant_to_files:
+            raise ValueError(
+                "All wheel files must have variant suffixes when `default_variant` is specified."
+            )
+        if default_variant not in variant_to_files:
+            raise ValueError(
+                f"Default variant '{default_variant}' not found among wheel files."
+            )
+
+    if alias_to_default:
+        if "default" not in variant_to_files:
+            # e.g. only some wheels are uploaded to S3 currently
+            print(
+                "[WARN] Alias to default variant specified, but no default variant found."
+            )
+        elif alias_to_default in variant_to_files:
+            raise ValueError(
+                f"Alias variant name '{alias_to_default}' already exists among wheel files."
+            )
+        else:
+            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+            print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+    # Generate index for each variant
+    subdir_names = set()
+    for variant, files in variant_to_files.items():
+        if variant == "default":
+            variant_dir = index_base_dir
+        else:
+            variant_dir = index_base_dir / variant
+            subdir_names.add(variant)
+
+        variant_dir.mkdir(parents=True, exist_ok=True)
+
+        # gather all package names in this variant
+        packages = set(f.package_name for f in files)
+        if variant == "default":
+            # these packages should also appear in the "project list"
+            # generate after all variants are processed
+            subdir_names = subdir_names.union(packages)
+        else:
+            # generate project list for this variant directly
+            project_list_str = generate_project_list(sorted(packages))
+            with open(variant_dir / "index.html", "w") as f:
+                f.write(project_list_str)
+
+        for package in packages:
+            # filter files belonging to this package only
+            package_files = [f for f in files if f.package_name == package]
+            package_dir = variant_dir / package
+            package_dir.mkdir(parents=True, exist_ok=True)
+            index_str, metadata_str = generate_package_index_and_metadata(
+                package_files, wheel_base_dir, package_dir
+            )
+            with open(package_dir / "index.html", "w") as f:
+                f.write(index_str)
+            with open(package_dir / "metadata.json", "w") as f:
+                f.write(metadata_str)
+
+    # Generate top-level project list index
+    project_list_str = generate_project_list(sorted(subdir_names))
+    with open(index_base_dir / "index.html", "w") as f:
+        f.write(project_list_str)
+
+
+if __name__ == "__main__":
+    """
+    Arguments:
+        --version <version> : version string for the current build (e.g., commit hash)
+        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
+        --output-dir <output_directory> : directory to store generated index files
+        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
+    """
+
+    parser = argparse.ArgumentParser(
+        description="Process nightly build wheel files to generate indices."
+    )
+    parser.add_argument(
+        "--version",
+        type=str,
+        required=True,
+        help="Version string for the current build (e.g., commit hash)",
+    )
+    parser.add_argument(
+        "--current-objects",
+        type=str,
+        required=True,
+        help="Path to JSON file containing current S3 objects listing in this version directory",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        required=True,
+        help="Directory to store generated index files",
+    )
+    parser.add_argument(
+        "--alias-to-default",
+        type=str,
+        default=None,
+        help="Alias variant name for the default variant",
+    )
+
+    args = parser.parse_args()
+
+    version = args.version
+    if "/" in version or "\\" in version:
+        raise ValueError("Version string must not contain slashes.")
+    current_objects_path = Path(args.current_objects)
+    output_dir = Path(args.output_dir)
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Read current objects JSON
+    with open(current_objects_path) as f:
+        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
+
+    # current_objects looks like from list_objects_v2 S3 API:
+    """
+    "Contents": [
+        {
+            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+            "LastModified": "2025-11-28T14:00:32+00:00",
+            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+            "ChecksumAlgorithm": [
+                "CRC64NVME"
+            ],
+            "ChecksumType": "FULL_OBJECT",
+            "Size": 435649349,
+            "StorageClass": "STANDARD"
+        },
+        ...
+    ]
+    """
+
+    # Extract wheel file keys
+    wheel_files = []
+    for item in current_objects.get("Contents", []):
+        key: str = item["Key"]
+        if key.endswith(".whl"):
+            wheel_files.append(key.split("/")[-1])  # only the filename is used
+
+    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+    # Generate index and metadata, assuming wheels and indices are stored as:
+    # s3://vllm-wheels/{version}/<wheel files>
+    # s3://vllm-wheels/<anything>/<index files>
+    wheel_base_dir = Path(output_dir).parent / version
+    index_base_dir = Path(output_dir)
+
+    generate_index_and_metadata(
+        whl_files=wheel_files,
+        wheel_base_dir=wheel_base_dir,
+        index_base_dir=index_base_dir,
+        default_variant=None,
+        alias_to_default=args.alias_to_default,
+    )
+    print(f"Successfully generated index and metadata in {output_dir}")
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -59,7 +59,7 @@ while true; do
        fi
 done

-echo "--- Pulling container" 
+echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"

-if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

-if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}

 if [[ $commands == *"pytest -v -s lora"* ]]; then
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
@@ -181,13 +177,13 @@ if [[ -z "$render_gid" ]]; then
  exit 1
 fi

-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
+# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
-  # assign job count as the number of shards used   
-  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+  # assign job count as the number of shards used
+  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
-    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-0-16}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
+
+export CMAKE_BUILD_PARALLEL_LEVEL=16
+
+# Setup cleanup
+remove_docker_container() {
+    set -e;
+    docker rm -f cpu-test || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
+
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
+
+function cpu_tests() {
+  set -e
+
+  docker exec cpu-test bash -c "
+    set -e
+    pip list"
+
+  # offline inference
+  docker exec cpu-test bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run kernel tests
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -x -v -s tests/kernels/test_onednn.py
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+
+  # basic online serving
+  docker exec cpu-test bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model Qwen/Qwen3-0.6B \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+}
+
+# All of CPU tests are expected to be finished less than 40 mins.
+export -f cpu_tests
+timeout 2h bash -c cpu_tests
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,20 +25,22 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
    set -xve
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
    set -evx
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
+    pip install sentence-transformers datamodel_code_generator tblib 

    # Note: disable Bart until supports V1
    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -49,6 +49,7 @@ function cpu_tests() {
  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
    pytest -x -v -s tests/kernels/test_onednn.py"

  # Run basic model test
@@ -72,12 +73,11 @@ function cpu_tests() {
    pytest -x -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

-  # Note: disable it until supports V1
-  # Run AWQ test
-  # docker exec cpu-test-"$NUMA_NODE" bash -c "
-  #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
-  #   tests/quantization/test_ipex_quant.py"
+  # Run AWQ/GPTQ test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/quantization/test_cpu_wna16.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
@@ -116,4 +116,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -35,7 +35,7 @@ docker run \
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
@@ -46,6 +46,6 @@ docker run \
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -17,7 +17,17 @@ wait_for_server() {
 }

 MODEL="deepseek-ai/DeepSeek-V2-lite"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi

 cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -1,10 +1,12 @@
 #!/usr/bin/env bash
 set -euxo pipefail

-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
 THRESHOLD=${1:-0.8}
 NUM_Q=${2:-1319}
 PORT=${3:-8020}
+DATA_PARALLEL_SIZE=${4:-2}
+TENSOR_PARALLEL_SIZE=${5:-2}
 OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
 mkdir -p "${OUT_DIR}"

@@ -17,7 +19,16 @@ wait_for_server() {
 }

 MODEL="QWen/Qwen3-30B-A3B-FP8"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi

 cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -36,8 +47,10 @@ for BACK in "${BACKENDS[@]}"; do
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
-    --tensor-parallel-size 2 \
-    --data-parallel-size 2 \
+    --enable-eplb \
+    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+    --data-parallel-size ${DATA_PARALLEL_SIZE} \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -2,6 +2,28 @@

 set -ex

+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.10+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+    # use new python from docker
+    docker pull python:3-slim
+    PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
 # Assume wheels are in artifacts/dist/*.whl
 wheel_files=(artifacts/dist/*.whl)

@@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
 fi
-
-# Get the single wheel file
 wheel="${wheel_files[0]}"

-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
-    manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
-    manylinux_version="manylinux2014"
-else
-    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
-    manylinux_version="manylinux1"
-fi
+# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="manylinux_2_31"

 # Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+  echo "Error: Wheel filename does not contain 'linux': $wheel"
+  exit 1
+fi
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"

 # Extract the version from the wheel
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"

-normal_wheel="$wheel" # Save the original wheel filename
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"

-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
-    suffix="${version##*.}"
-    if [[ $suffix == cu* ]]; then
-        new_version="1.0.0.dev+${suffix}"
-    else
-        new_version="1.0.0.dev"
-    fi
-    new_wheel="${wheel/$version/$new_version}"
-    # use cp to keep both files in the artifacts directory
-    cp -- "$wheel" "$new_wheel"
-    wheel="$new_wheel"
-    version="$new_version"
-fi
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built
+# so we need to guarantee there is little chance for "TOCTOU" issues
+# i.e., one process is generating indices while another is uploading a new wheel
+# so we need to ensure no time-consuming operations happen below

-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"

-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-
-if [[ $normal_wheel == *"cu129"* ]]; then
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
-    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+# call script to generate indicies for all existing wheels
+# this indices have relative paths that could work as long as it is next to the wheel directory in s3
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
 else
-    echo "Skipping index files for non-cu129 wheels"
+    alias_arg=""
 fi

-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg

-if [[ $normal_wheel == *"cu129"* ]]; then
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
-    echo "Skipping index files for non-cu129 wheels"
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR 
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+    echo "Uploading indices to overwrite /nightly/"
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
 fi

-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# copy to /<pure_version>/ only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+    echo "Uploading indices to overwrite /$pure_version/"
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -39,9 +39,9 @@ steps:
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:

 - label: Async Engine, Inputs, Utils, Worker Test # 10min
  timeout_in_minutes: 15
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/multimodal
@@ -61,25 +61,29 @@ steps:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
-  mirror_hardwares: [amdexperimental, amdproduction]
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
  - tests/transformers_utils
+  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
  - pytest -v -s transformers_utils
+  - pytest -v -s config

 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@@ -111,9 +115,9 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py

 - label: Entrypoints Unit Tests # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@@ -187,7 +191,7 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
@@ -210,12 +214,13 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -226,10 +231,31 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

- label: EPLB Algorithm Test # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
  # grade: Blocking
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  #- export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: EPLB Algorithm Test # 5min
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
@@ -238,11 +264,11 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

- label: EPLB Execution Test # 5min
+- label: EPLB Execution Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
-  timeout_in_minutes: 15
+  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -250,6 +276,7 @@ steps:
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py

 - label: Metrics, Tracing Test # 12min
  timeout_in_minutes: 20
@@ -273,7 +300,7 @@ steps:

 - label: Regression Test # 7min
  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  source_file_dependencies:
@@ -284,23 +311,20 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
-  timeout_in_minutes: 40
+- label: Engine Test # 9min
+  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
-  #grade: Blocking
+  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/engine
-  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization

 - label: V1 Test e2e + engine # 30min
  timeout_in_minutes: 45
@@ -318,9 +342,9 @@ steps:

 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/v1
@@ -337,6 +361,7 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
@@ -344,18 +369,56 @@ steps:
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
    - pytest -v -s v1/spec_decode
-    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_outputs.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Test others (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental, amdproduction]
+# TODO: Add the "V1 Test attetion (MI300)" test group
+
+- label: V1 Test attention (H100) # 10min
+  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+
+- label: Batch Invariance Tests (H100) # 10min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
+- label: V1 Test others (CPU) # 5 mins
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/v1
@@ -447,7 +510,7 @@ steps:

 - label: PyTorch Compilation Unit Tests # 15min
  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
@@ -455,33 +518,15 @@ steps:
    - vllm/
    - tests/compile
  commands:
-    - pytest -v -s compile/test_pass_manager.py
-    - pytest -v -s compile/test_fusion.py
-    - pytest -v -s compile/test_fusion_attn.py
-    - pytest -v -s compile/test_functionalization.py
-    - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  #  - pytest -v -s compile/test_sequence_parallelism.py
-  #  - pytest -v -s compile/test_async_tp.py
-    - pytest -v -s compile/test_fusion_all_reduce.py
-    - pytest -v -s compile/test_decorator.py
-    - pytest -v -s compile/test_noop_elimination.py
-    - pytest -v -s compile/test_aot_compile.py
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_1
-  # grade: Blocking
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
-  - tests/compile
-  commands:
-  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s compile/piecewise/
-
- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
@@ -490,8 +535,39 @@ steps:
  - vllm/
  - tests/compile
  commands:
-  - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@@ -507,7 +583,7 @@ steps:

 - label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
@@ -534,7 +610,7 @@ steps:

 - label: Kernels MoE Test %N # 40min
  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
@@ -543,6 +619,8 @@ steps:
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@@ -559,12 +637,35 @@ steps:
  commands:
    - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+# Not replicating for CUTLAS & CuTe
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
+  torch_nightly: true
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -638,7 +739,7 @@ steps:
  - vllm/model_executor/models/whisper.py
  commands: # LMEval
  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-  - pytest -s entrypoints/openai/correctness/  --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
+  - pytest -s entrypoints/openai/correctness/

 - label: OpenAI-Compatible Tool Use # 23 min
  timeout_in_minutes: 35
@@ -687,6 +788,7 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
+  - vllm/transformers_utils/
  - tests/models/test_initialization.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
@@ -860,9 +962,10 @@ steps:
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  timeout_in_minutes: 70
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
-  timeout_in_minutes: 15
+  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
@@ -933,16 +1036,17 @@ steps:
 - label: Transformers Nightly Models Test
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
+  # grade: Blocking
  working_dir: "/vllm-workspace/"
  optional: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
-    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
    - pytest -v -s tests/models/test_transformers.py
-    - pytest -v -s tests/models/multimodal/processing/
-    - pytest -v -s tests/models/multimodal/test_mapping.py
+    # - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
    - python3 examples/offline_inference/basic/chat.py
-    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -960,11 +1064,16 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -981,8 +1090,9 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
@@ -990,23 +1100,58 @@ steps:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - vllm/model_executor/layers/fused_moe/layer.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_gpus=2 is not set
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusions_e2e.py
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Blackwell GPT-OSS Eval
+- label: ROCm GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  agent_pool: mi325_1
+  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
@@ -1015,7 +1160,7 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 - label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
@@ -1105,7 +1250,7 @@ steps:
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
+  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
@@ -1115,10 +1260,11 @@ steps:
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1150,7 +1296,7 @@ steps:

 - label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
@@ -1218,12 +1364,15 @@ steps:
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
-    - pytest -v -s -x lora/test_gptoss_tp.py
+
+    # Disabled for now because MXFP4 backend on non-cuda platform 
+    # doesn't support LoRA yet
+    #- pytest -v -s -x lora/test_gptoss_tp.py


 - label: Weight Loading Multiple GPU Test  # 33min
  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
@@ -1233,7 +1382,7 @@ steps:
  - vllm/
  - tests/weight_loading
  commands:
-    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt

 - label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental]
@@ -1241,17 +1390,17 @@ steps:
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
-  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
-    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt

 - label: NixlConnector PD accuracy tests (Distributed) # 30min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
+  # grade: Blocking
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@@ -1266,6 +1415,9 @@ steps:
 ##### A100 test #####

 - label: Distributed Tests (A100) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
  gpu: a100
  optional: true
  num_gpus: 4
@@ -1280,6 +1432,9 @@ steps:
  - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
  gpu: a100
  optional: true
  num_gpus: 4
@@ -1291,19 +1446,41 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - pytest -v -s tests/compile/test_async_tp.py
-    - pytest -v -s tests/compile/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+    - pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1314,6 +1491,7 @@ steps:
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
@@ -1329,3 +1507,36 @@ steps:
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -25,6 +25,7 @@
 #     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
 # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
 # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch.

 # When adding a test
 # - If the test belongs to an existing group, add it there
@@ -56,22 +57,26 @@ steps:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
  - tests/transformers_utils
+  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
  - pytest -v -s transformers_utils
+  - pytest -v -s config

 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@@ -164,7 +169,7 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
@@ -189,12 +194,13 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -272,21 +278,18 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
-  timeout_in_minutes: 40
+- label: Engine Test # 9min
+  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/engine
-  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization

 - label: V1 Test e2e + engine # 30min
  timeout_in_minutes: 45
@@ -329,6 +332,7 @@ steps:
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_outputs.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -342,6 +346,18 @@ steps:
  commands:
    - pytest -v -s v1/attention

+- label: Batch Invariance Tests (H100) # 10min
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
 - label: V1 Test attention (B200) # 10min
  timeout_in_minutes: 30
  gpu: b200
@@ -441,16 +457,12 @@ steps:
    - vllm/
    - tests/compile
  commands:
-    - pytest -v -s compile/test_config.py
-    - pytest -v -s compile/test_pass_manager.py
-    - pytest -v -s compile/test_fusion.py
-    - pytest -v -s compile/test_fusion_attn.py
-    - pytest -v -s compile/test_functionalization.py
-    - pytest -v -s compile/test_silu_mul_quant_fusion.py
-    - pytest -v -s compile/test_fusion_all_reduce.py
-    - pytest -v -s compile/test_decorator.py
-    - pytest -v -s compile/test_noop_elimination.py
-    - pytest -v -s compile/test_aot_compile.py
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -460,12 +472,14 @@ steps:
  - vllm/
  - tests/compile
  commands:
-  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s compile/test_multimodal_compile.py
-  - pytest -v -s compile/piecewise/
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"

- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@@ -473,10 +487,10 @@ steps:
  - tests/compile
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
-  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
    # Limit to no custom ops to reduce running time
    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -548,6 +562,25 @@ steps:
  commands:
    - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true
@@ -598,6 +631,7 @@ steps:
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
@@ -606,6 +640,7 @@ steps:
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

@@ -657,6 +692,7 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
+  - vllm/transformers_utils/
  - tests/models/test_initialization.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
@@ -783,14 +819,24 @@ steps:
  commands:
    - pytest -v -s models/language/pooling_mteb_test

- label: Multi-Modal Processor Test # 44min
+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor Test
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing
+    - pytest -v -s models/multimodal/processing/test_tensor_schema.py

 - label: Multi-Modal Models Test (Standard) # 60min
  timeout_in_minutes: 80
@@ -867,14 +913,15 @@ steps:
 - label: Transformers Nightly Models Test
  working_dir: "/vllm-workspace/"
  optional: true
+  soft_fail: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
-    - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+    - pytest -v -s tests/models/test_initialization.py
    - pytest -v -s tests/models/test_transformers.py
-    # - pytest -v -s tests/models/multimodal/processing/
-    - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
-    # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -892,11 +939,16 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -913,8 +965,9 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion & Compile Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
@@ -922,22 +975,29 @@ steps:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-    - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
@@ -954,12 +1014,11 @@ steps:
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusions_e2e.py
-  - tests/compile/test_full_graph.py
+  - tests/compile/distributed/test_fusions_e2e.py
  commands:
    - nvidia-smi
    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/test_fusions_e2e.py
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
@@ -1057,7 +1116,7 @@ steps:
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
+  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
@@ -1069,10 +1128,11 @@ steps:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1252,11 +1312,11 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - pytest -v -s tests/compile/test_async_tp.py
-    - pytest -v -s tests/compile/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
    - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -1293,11 +1353,20 @@ steps:
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,12 +3,13 @@

 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
+/vllm/model_executor/layers/batch_invariant.py @yewentao256 
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
@@ -20,27 +21,30 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
+/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill
 /vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC

+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
+
 # Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/.buildkite/lm-eval-harness @mgoin 
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
 /tests/evals @mgoin
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
@@ -49,18 +53,29 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
+/tests/v1/determinism @yewentao256 

-# Transformers backend
+# Transformers modeling backend
 /vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor

+# Observability
+/vllm/config/observability.py @markmc
+/vllm/v1/metrics @markmc
+/tests/v1/metrics @markmc
+/vllm/tracing.py @markmc
+/tests/v1/tracing/test_tracing.py @markmc
+/vllm/config/kv_events.py @markmc
+/vllm/distributed/kv_events.py @markmc
+/tests/distributed/test_events.py @markmc
+
 # Docs
 /docs/mkdocs @hmellor
 /docs/**/*.yml @hmellor
@@ -134,6 +149,7 @@ mkdocs.yaml @hmellor
 /examples/*/pooling/ @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -151,6 +151,23 @@ pull_request_rules:
      add:
        - gpt-oss

+- name: label-nvidia
+  description: Automatically apply nvidia label
+  conditions:
+    - label != stale
+    - or:
+      - files~=cuda
+      - files~=cutlass
+      - files~=flashinfer
+      - files~=trtllm
+      - title~=(?i)NVIDIA
+      - title~=(?i)CUDA
+      - title~=(?i)CUTLASS
+  actions:
+    label:
+      add:
+        - nvidia
+
 - name: label-rocm
  description: Automatically apply rocm label
  conditions:
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,7 +13,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0

      - name: Set up Python
        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -105,6 +105,31 @@ jobs:
                  }
                ],
              },
+              cpu: {
+                // Keyword search - matches whole words only (with word boundaries)
+                keywords: [
+                  {
+                    term: "CPU Backend",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "x86",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "ARM",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "Apple Silicon",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "IBM Z",
+                    searchIn: "title"
+                  },
+                ],
+              },
              // Add more label configurations here as needed
              // example: {
              //   keywords: [...],
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -0,0 +1,80 @@
+name: macOS Apple Silicon Smoke Test
+
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:  # Manual trigger
+
+jobs:
+  macos-m1-smoke-test:
+    runs-on: macos-latest
+    timeout-minutes: 30
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+          cache-dependency-glob: |
+            requirements/**/*.txt
+            pyproject.toml
+          python-version: '3.12'
+
+      - name: Create virtual environment
+        run: |
+          uv venv
+          echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"
+
+      - name: Install dependencies and build vLLM
+        run: |
+          uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+          uv pip install -e .
+        env:
+          CMAKE_BUILD_PARALLEL_LEVEL: 4
+
+      - name: Verify installation
+        run: |
+          python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+
+      - name: Smoke test vllm serve
+        run: |
+          # Start server in background
+          vllm serve Qwen/Qwen3-0.6B \
+            --max-model-len=2K \
+            --load-format=dummy \
+            --hf-overrides '{"num_hidden_layers": 2}' \
+            --enforce-eager \
+            --port 8000 &
+
+          SERVER_PID=$!
+
+          # Wait for server to start
+          for i in {1..30}; do
+            if curl -s http://localhost:8000/health > /dev/null; then
+              echo "Server started successfully"
+              break
+            fi
+            if [ "$i" -eq 30 ]; then
+              echo "Server failed to start"
+              kill "$SERVER_PID"
+              exit 1
+            fi
+            sleep 2
+          done
+
+          # Test health endpoint
+          curl -f http://localhost:8000/health
+
+          # Test completion
+          curl -f http://localhost:8000/v1/completions \
+            -H "Content-Type: application/json" \
+            -d '{
+              "model": "Qwen/Qwen3-0.6B",
+              "prompt": "Hello",
+              "max_tokens": 5
+            }'
+
+          # Cleanup
+          kill "$SERVER_PID"
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
      with:
        python-version: "3.12"
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*

+# OpenAI triton kernels copied from source
+vllm/third_party/triton_kernels/*
+
 # triton jit
 .triton

--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -3,10 +3,9 @@ MD007:
 MD013: false
 MD024:
  siblings_only: true
+MD031:
+  list_items: false
 MD033: false
-MD045: false
 MD046: false
-MD051: false
 MD052: false
-MD053: false
 MD059: false
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,13 @@ set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

+# ROCm installation prefix. Default to /opt/rocm but allow override via
+# -DROCM_PATH=/your/rocm/path when invoking cmake.
+if(NOT DEFINED ROCM_PATH)
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
+else()
+  set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
+endif()
 #
 # Supported/expected torch versions for CUDA/ROCm.
 #
@@ -129,7 +136,7 @@ elseif(HIP_FOUND)

  # ROCm 5.X and 6.X
  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+      Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
    message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
      "expected for ROCm build, saw ${Torch_VERSION} instead.")
  endif()
@@ -237,10 +244,27 @@ set_gencode_flags_for_srcs(
  SRCS "${VLLM_CUMEM_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")

-if(VLLM_GPU_LANG STREQUAL "CUDA")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling cumem allocator extension.")
-  # link against cuda driver library
-  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  if(VLLM_GPU_LANG STREQUAL "CUDA")
+    # link against cuda driver library
+    list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  else()
+    # link against rocm driver library. Prefer an absolute path to
+    # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
+    # back to linking by name "amdhip64".
+    find_library(AMDHIP64_LIB
+      NAMES amdhip64 libamdhip64.so
+      PATHS ${ROCM_PATH}/lib
+      NO_DEFAULT_PATH)
+    if(AMDHIP64_LIB)
+      message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
+      list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
+    else()
+      message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
+      list(APPEND CUMEM_LIBS amdhip64)
+    endif()
+  endif()
  define_extension_target(
    cumem_allocator
    DESTINATION vllm
@@ -265,6 +289,7 @@ set(VLLM_EXT_SRC
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
@@ -282,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -329,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
-  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+
+  # marlin arches for fp16 output
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for fp8 input
+  # - sm80 doesn't support fp8 computation
+  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
+  cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+
  if (MARLIN_ARCHS)

    #
@@ -340,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+    set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")

-    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")

-    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
        RESULT_VARIABLE marlin_generation_result
        OUTPUT_VARIABLE marlin_generation_result
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -362,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
      else()
-        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
-            CACHE STRING "Last run Marlin generate script hash" FORCE)
+        set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+            CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
        message(STATUS "Marlin generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin generation script has not changed, skipping generation.")
    endif()

-    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -378,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
-
    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

+    file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+      CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
+
+    if (MARLIN_FP8_ARCHS) 
+      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+    endif()
+
    set(MARLIN_SRCS
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+       "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
@@ -487,9 +545,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
@@ -579,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -594,9 +655,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # FP4 Archs and flags
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
@@ -670,7 +731,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
@@ -716,9 +777,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
@@ -836,7 +897,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
  if(HADACORE_ARCHS)
    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
    set_gencode_flags_for_srcs(
@@ -913,8 +974,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    CUDA_ARCHS "${CUDA_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  # moe marlin arches
+  # note that we always set `use_atomic_add=False` for moe marlin now,
+  # so we don't need 9.0 for bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # moe marlin arches for fp8 input
+  # - sm80 doesn't support fp8 computation
+  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
+  cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)

    #
@@ -924,16 +992,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(MOE_MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+    set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")

-    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+    message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")

-    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -946,7 +1016,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
      else()
-        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+        set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
        message(STATUS "Marlin MOE generation completed successfully.")
      endif()
@@ -954,16 +1024,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
    endif()

-    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
+    file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+    list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
    set_gencode_flags_for_srcs(
-      SRCS "${MOE_WNAA16_MARLIN_SRC}"
+      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+      set_source_files_properties(${MARLIN_MOE_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
+    list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})

-    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+    if (MARLIN_MOE_FP8_ARCHS)
+      file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_FP8_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+    endif()

    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
@@ -1005,6 +1087,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    WITH_SOABI)
 endif()

+# For CUDA and HIP builds also build the triton_kernels external package.
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+    include(cmake/external_projects/triton_kernels.cmake)
+endif()
+
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
    include(cmake/external_projects/flashmla.cmake)
--- a/README.md
+++ b/README.md
@@ -21,6 +21,8 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
+- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
 ```

-#### 2. Maximize Throughput with a Latency Requirement
+### 2. Maximize Throughput with a Latency Requirement

 - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
 - **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=500
 ```

-#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements

 - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
 - **Configuration**:
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -620,7 +620,7 @@ def get_tokenizer(
        kwargs["use_fast"] = False
    if tokenizer_mode == "mistral":
        try:
-            from vllm.transformers_utils.tokenizer import MistralTokenizer
+            from vllm.tokenizers import MistralTokenizer
        except ImportError as e:
            raise ImportError(
                "MistralTokenizer requires vllm package.\n"
--- a/benchmarks/benchmark_batch_invariance.py
+++ b/benchmarks/benchmark_batch_invariance.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
+
+This benchmark runs the same workload twice:
+1. With VLLM_BATCH_INVARIANT=0 (baseline)
+2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
+
+And reports the timing and throughput metrics for comparison.
+
+Environment variables:
+    VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
+    VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
+    VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
+    VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
+    VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
+    VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
+    VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
+    VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
+    VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
+    VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
+    VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
+
+Example usage:
+    # Benchmark qwen3 (default)
+    python benchmarks/benchmark_batch_invariance.py
+
+    # Benchmark deepseek with 8 GPUs
+    VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
+        python benchmarks/benchmark_batch_invariance.py
+
+    # Quick test with fewer trials
+    VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
+        python benchmarks/benchmark_batch_invariance.py
+"""
+
+import contextlib
+import os
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+
+
+def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
+    """Generate a random prompt for benchmarking."""
+    prompt_templates = [
+        "Question: What is the capital of France?\nAnswer: The capital of France is",
+        "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
+        "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
+        "Once upon a time in a distant galaxy, there lived",
+        "The old man walked slowly down the street, remembering",
+        "In the year 2157, humanity finally discovered",
+        "To implement a binary search tree in Python, first we need to",
+        "The algorithm works by iterating through the array and",
+        "Here's how to optimize database queries using indexing:",
+        "The Renaissance was a period in European history that",
+        "Climate change is caused by several factors including",
+        "The human brain contains approximately 86 billion neurons which",
+        "I've been thinking about getting a new laptop because",
+        "Yesterday I went to the store and bought",
+        "My favorite thing about summer is definitely",
+    ]
+
+    base_prompt = random.choice(prompt_templates)
+
+    if max_words < min_words:
+        max_words = min_words
+    target_words = random.randint(min_words, max_words)
+
+    if target_words > 50:
+        padding_text = (
+            " This is an interesting topic that deserves more explanation. "
+            * (target_words // 50)
+        )
+        base_prompt = base_prompt + padding_text
+
+    return base_prompt
+
+
+def run_benchmark_with_batch_invariant(
+    model: str,
+    tp_size: int,
+    max_batch_size: int,
+    num_trials: int,
+    min_prompt: int,
+    max_prompt: int,
+    max_tokens: int,
+    temperature: float,
+    gpu_mem_util: float,
+    max_model_len: int,
+    backend: str,
+    batch_invariant: bool,
+    seed: int = 12345,
+) -> dict:
+    """
+    Run the benchmark with the specified configuration.
+
+    Returns a dict with timing and throughput metrics.
+    """
+    random.seed(seed)
+
+    # Set environment variables
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    if batch_invariant:
+        os.environ["VLLM_BATCH_INVARIANT"] = "1"
+    else:
+        os.environ["VLLM_BATCH_INVARIANT"] = "0"
+
+    print(f"\n{'=' * 80}")
+    print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
+    print(f"  Model: {model}")
+    print(f"  TP Size: {tp_size}")
+    print(f"  Backend: {backend}")
+    print(f"  Max Batch Size: {max_batch_size}")
+    print(f"  Trials: {num_trials}")
+    print(f"  Max Tokens: {max_tokens}")
+    print(f"{'=' * 80}\n")
+
+    sampling = SamplingParams(
+        temperature=temperature,
+        top_p=0.95,
+        max_tokens=max_tokens,
+        seed=20240919,
+    )
+
+    needle_prompt = "There once was a "
+
+    llm = None
+    try:
+        # Create LLM engine
+        start_init = time.perf_counter()
+        llm = LLM(
+            model=model,
+            max_num_seqs=max_batch_size,
+            gpu_memory_utilization=gpu_mem_util,
+            max_model_len=max_model_len,
+            dtype="bfloat16",
+            tensor_parallel_size=tp_size,
+            enable_prefix_caching=False,
+        )
+        init_time = time.perf_counter() - start_init
+        print(f"Engine initialization time: {init_time:.2f}s\n")
+
+        # Generate baseline
+        print("Generating baseline (warmup)...")
+        baseline_out = llm.generate([needle_prompt], sampling)
+        assert len(baseline_out) == 1
+        baseline_text = baseline_out[0].outputs[0].text
+        print(f"Baseline output: '{baseline_text[:50]}...'\n")
+
+        # Run trials and measure timing
+        trial_times: list[float] = []
+        total_tokens = 0
+        total_prompts = 0
+
+        for trial in range(num_trials):
+            # Create a batch
+            prompts: list[str] = []
+            batch_size = random.randint(max_batch_size // 2, max_batch_size)
+            needle_pos = random.randint(0, batch_size - 1)
+            for i in range(batch_size):
+                if i == needle_pos:
+                    prompts.append(needle_prompt)
+                else:
+                    prompts.append(_random_prompt(min_prompt, max_prompt))
+
+            # Measure time for this trial
+            start_time = time.perf_counter()
+            outputs = llm.generate(prompts, sampling)
+            trial_time = time.perf_counter() - start_time
+
+            trial_times.append(trial_time)
+            total_prompts += len(prompts)
+
+            # Count tokens
+            for output in outputs:
+                if output.outputs:
+                    total_tokens += len(output.outputs[0].token_ids)
+
+            print(
+                f"Trial {trial + 1}/{num_trials}: "
+                f"batch_size={batch_size}, "
+                f"time={trial_time:.2f}s"
+            )
+
+            # Verify needle output still matches
+            needle_output = outputs[needle_pos]
+            assert needle_output.prompt == needle_prompt
+
+        # Compute statistics
+        avg_time = sum(trial_times) / len(trial_times)
+        min_time = min(trial_times)
+        max_time = max(trial_times)
+        throughput = total_tokens / sum(trial_times)
+        prompts_per_sec = total_prompts / sum(trial_times)
+
+        print(f"\n{'=' * 80}")
+        print("RESULTS:")
+        print(f"  Average time per trial: {avg_time:.2f}s")
+        print(f"  Min time: {min_time:.2f}s")
+        print(f"  Max time: {max_time:.2f}s")
+        print(f"  Total tokens generated: {total_tokens}")
+        print(f"  Total prompts processed: {total_prompts}")
+        print(f"  Throughput: {throughput:.2f} tokens/s")
+        print(f"  Prompts/s: {prompts_per_sec:.2f}")
+        print(f"{'=' * 80}\n")
+
+        return {
+            "init_time": init_time,
+            "avg_time": avg_time,
+            "min_time": min_time,
+            "max_time": max_time,
+            "total_tokens": total_tokens,
+            "total_prompts": total_prompts,
+            "throughput": throughput,
+            "prompts_per_sec": prompts_per_sec,
+            "trial_times": trial_times,
+        }
+
+    finally:
+        # Cleanup
+        if llm is not None:
+            with contextlib.suppress(Exception):
+                llm.shutdown()
+
+
+def main():
+    # Check platform support
+    if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
+        print("ERROR: Requires CUDA and >= Hopper (SM90)")
+        print(f"Current platform: {current_platform.device_type}")
+        if current_platform.is_cuda():
+            print(f"Device capability: {current_platform.get_device_capability()}")
+        return 1
+
+    # Read configuration from environment
+    model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
+    tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
+    max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
+    num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
+    min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
+    max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
+    max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
+    temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
+    gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
+    max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
+    backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
+
+    print("\n" + "=" * 80)
+    print("VLLM BATCH INVARIANCE BENCHMARK")
+    print("=" * 80)
+    print("\nConfiguration:")
+    print(f"  Model: {model}")
+    print(f"  Tensor Parallel Size: {tp_size}")
+    print(f"  Attention Backend: {backend}")
+    print(f"  Max Batch Size: {max_batch_size}")
+    print(f"  Number of Trials: {num_trials}")
+    print(f"  Prompt Length Range: {min_prompt}-{max_prompt} words")
+    print(f"  Max Tokens to Generate: {max_tokens}")
+    print(f"  Temperature: {temperature}")
+    print(f"  GPU Memory Utilization: {gpu_mem_util}")
+    print(f"  Max Model Length: {max_model_len}")
+    print("=" * 80)
+
+    # Run benchmark WITHOUT batch invariance (baseline)
+    print("\n" + "=" * 80)
+    print("PHASE 1: Running WITHOUT batch invariance (baseline)")
+    print("=" * 80)
+    baseline_results = run_benchmark_with_batch_invariant(
+        model=model,
+        tp_size=tp_size,
+        max_batch_size=max_batch_size,
+        num_trials=num_trials,
+        min_prompt=min_prompt,
+        max_prompt=max_prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        gpu_mem_util=gpu_mem_util,
+        max_model_len=max_model_len,
+        backend=backend,
+        batch_invariant=False,
+    )
+
+    # Run benchmark WITH batch invariance
+    print("\n" + "=" * 80)
+    print("PHASE 2: Running WITH batch invariance")
+    print("=" * 80)
+    batch_inv_results = run_benchmark_with_batch_invariant(
+        model=model,
+        tp_size=tp_size,
+        max_batch_size=max_batch_size,
+        num_trials=num_trials,
+        min_prompt=min_prompt,
+        max_prompt=max_prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        gpu_mem_util=gpu_mem_util,
+        max_model_len=max_model_len,
+        backend=backend,
+        batch_invariant=True,
+    )
+
+    # Compare results
+    print("\n" + "=" * 80)
+    print("COMPARISON: Batch Invariance vs Baseline")
+    print("=" * 80)
+
+    init_overhead_pct = (
+        (batch_inv_results["init_time"] - baseline_results["init_time"])
+        / baseline_results["init_time"]
+        * 100
+    )
+    time_overhead_pct = (
+        (batch_inv_results["avg_time"] - baseline_results["avg_time"])
+        / baseline_results["avg_time"]
+        * 100
+    )
+    throughput_change_pct = (
+        (batch_inv_results["throughput"] - baseline_results["throughput"])
+        / baseline_results["throughput"]
+        * 100
+    )
+
+    print("\nInitialization Time:")
+    print(f"  Baseline:         {baseline_results['init_time']:.2f}s")
+    print(f"  Batch Invariant:  {batch_inv_results['init_time']:.2f}s")
+    print(f"  Overhead:         {init_overhead_pct:+.2f}%")
+
+    print("\nAverage Trial Time:")
+    print(f"  Baseline:         {baseline_results['avg_time']:.2f}s")
+    print(f"  Batch Invariant:  {batch_inv_results['avg_time']:.2f}s")
+    print(f"  Overhead:         {time_overhead_pct:+.2f}%")
+
+    print("\nThroughput (tokens/s):")
+    print(f"  Baseline:         {baseline_results['throughput']:.2f}")
+    print(f"  Batch Invariant:  {batch_inv_results['throughput']:.2f}")
+    print(f"  Change:           {throughput_change_pct:+.2f}%")
+
+    print("\nPrompts/s:")
+    print(f"  Baseline:         {baseline_results['prompts_per_sec']:.2f}")
+    print(f"  Batch Invariant:  {batch_inv_results['prompts_per_sec']:.2f}")
+
+    print("\n" + "=" * 80)
+    print("SUMMARY")
+    print("=" * 80)
+    if time_overhead_pct > 0:
+        print(
+            f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
+            "overhead"
+        )
+    else:
+        print(
+            f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
+            "faster (unexpected!)"
+        )
+
+    if abs(throughput_change_pct) < 1.0:
+        print("Throughput difference is negligible (< 1%)")
+    elif throughput_change_pct < 0:
+        print(
+            f"Throughput decreased by {-throughput_change_pct:.1f}% "
+            "with batch invariance"
+        )
+    else:
+        print(
+            f"Throughput increased by {throughput_change_pct:.1f}% "
+            "with batch invariance (unexpected!)"
+        )
+
+    print("=" * 80 + "\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
        device_config=DeviceConfig(device=current_platform.device_type),
        parallel_config=ParallelConfig(),
        load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
    )

    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils.argparse_utils import FlexibleArgumentParser

 try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 except ImportError:
    from backend_request_func import get_tokenizer

@@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:

    # Remove the special tokens.
    return random.choices(
-        [v for k, v in vocab.items() if k not in all_special_ids],
+        [v for v in vocab.values() if v not in all_special_ids],
        k=length,
    )

--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

 try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 except ImportError:
    from backend_request_func import get_tokenizer

--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@@ -5,11 +5,12 @@ import argparse
 import asyncio
 import logging
 import os
+import time
+import uuid
+from urllib.parse import urlparse

 import aiohttp
 from quart import Quart, Response, make_response, request
-from rate_limiter import RateLimiter
-from request_queue import RequestQueue

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -24,26 +25,8 @@ def parse_args():
    parser.add_argument(
        "--timeout",
        type=float,
-        default=300,
-        help="Timeout for backend service requests in seconds (default: 300)",
-    )
-    parser.add_argument(
-        "--max-concurrent",
-        type=int,
-        default=100,
-        help="Maximum concurrent requests to backend services (default: 100)",
-    )
-    parser.add_argument(
-        "--queue-size",
-        type=int,
-        default=500,
-        help="Maximum number of requests in the queue (default: 500)",
-    )
-    parser.add_argument(
-        "--rate-limit",
-        type=int,
-        default=40,
-        help="Maximum requests per second (default: 40)",
+        default=6 * 60 * 60,
+        help="Timeout for backend service requests in seconds (default: 21600)",
    )
    parser.add_argument(
        "--port",
@@ -54,14 +37,32 @@ def parse_args():
    parser.add_argument(
        "--prefill-url",
        type=str,
-        default="http://localhost:8100/v1/completions",
-        help="Prefill service endpoint URL",
+        default="http://localhost:8100",
+        help="Prefill service base URL (protocol + host[:port])",
    )
    parser.add_argument(
        "--decode-url",
        type=str,
-        default="http://localhost:8200/v1/completions",
-        help="Decode service endpoint URL",
+        default="http://localhost:8200",
+        help="Decode service base URL (protocol + host[:port])",
+    )
+    parser.add_argument(
+        "--kv-host",
+        type=str,
+        default="localhost",
+        help="Hostname or IP used by KV transfer (default: localhost)",
+    )
+    parser.add_argument(
+        "--prefill-kv-port",
+        type=int,
+        default=14579,
+        help="Prefill KV port (default: 14579)",
+    )
+    parser.add_argument(
+        "--decode-kv-port",
+        type=int,
+        default=14580,
+        help="Decode KV port (default: 14580)",
    )

    return parser.parse_args()
@@ -73,70 +74,129 @@ def main():

    # Initialize configuration using command line parameters
    AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
-    MAX_CONCURRENT_REQUESTS = args.max_concurrent
-    REQUEST_QUEUE_SIZE = args.queue_size
-    RATE_LIMIT = args.rate_limit
    PREFILL_SERVICE_URL = args.prefill_url
    DECODE_SERVICE_URL = args.decode_url
    PORT = args.port

+    PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
+    DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
+
+    logger.info(
+        "Proxy resolved KV addresses -> prefill: %s, decode: %s",
+        PREFILL_KV_ADDR,
+        DECODE_KV_ADDR,
+    )
+
    app = Quart(__name__)

-    # Initialize the rate limiter and request queue
-    rate_limiter = RateLimiter(RATE_LIMIT)
-    request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
-
-    # Attach the configuration object to the application instance
+    # Attach the configuration object to the application instance so helper
+    # coroutines can read the resolved backend URLs and timeouts without using
+    # globals.
    app.config.update(
        {
            "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
-            "rate_limiter": rate_limiter,
-            "request_queue": request_queue,
            "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
            "DECODE_SERVICE_URL": DECODE_SERVICE_URL,
+            "PREFILL_KV_ADDR": PREFILL_KV_ADDR,
+            "DECODE_KV_ADDR": DECODE_KV_ADDR,
        }
    )

-    # Start queue processing on app startup
-    @app.before_serving
-    async def startup():
-        """Start request processing task when app starts serving"""
-        asyncio.create_task(request_queue.process())
+    def _normalize_base_url(url: str) -> str:
+        """Remove any trailing slash so path joins behave predictably."""
+        return url.rstrip("/")

-    async def forward_request(url, data):
-        """Forward request to backend service with rate limiting and error handling"""
-        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    def _get_host_port(url: str) -> str:
+        """Return the hostname:port portion for logging and KV headers."""
+        parsed = urlparse(url)
+        host = parsed.hostname or "localhost"
+        port = parsed.port
+        if port is None:
+            port = 80 if parsed.scheme == "http" else 443
+        return f"{host}:{port}"

-        # Use rate limiter as context manager
-        async with (
-            rate_limiter,
-            aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
-        ):
-            try:
-                async with session.post(
-                    url=url, json=data, headers=headers
-                ) as response:
-                    if response.status == 200:
-                        # Stream response chunks
-                        async for chunk_bytes in response.content.iter_chunked(1024):
-                            yield chunk_bytes
-                    else:
-                        # Handle backend service errors
-                        error_text = await response.text()
-                        logger.error(
-                            "Backend service error: %s - %s",
-                            response.status,
-                            error_text,
-                        )
-                        yield b'{"error": "Backend service error"}'
-            except aiohttp.ClientError as e:
-                # Handle connection errors
-                logger.error("Connection error to %s: %s", url, str(e))
-                yield b'{"error": "Service unavailable"}'
-            except asyncio.TimeoutError:
-                # Handle timeout errors
-                logger.error("Timeout connecting to %s", url)
-                yield b'{"error": "Service timeout"}'
+    PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL)
+    DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL)
+    KV_TARGET = _get_host_port(DECODE_SERVICE_URL)
+
+    def _build_headers(request_id: str) -> dict[str, str]:
+        """Construct the headers expected by vLLM's P2P disagg connector."""
+        headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET}
+        api_key = os.environ.get("OPENAI_API_KEY")
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+        return headers
+
+    async def _run_prefill(
+        request_path: str,
+        payload: dict,
+        headers: dict[str, str],
+        request_id: str,
+    ):
+        url = f"{PREFILL_BASE}{request_path}"
+        start_ts = time.perf_counter()
+        logger.info("[prefill] start request_id=%s url=%s", request_id, url)
+        try:
+            async with (
+                aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+                session.post(url=url, json=payload, headers=headers) as resp,
+            ):
+                if resp.status != 200:
+                    error_text = await resp.text()
+                    raise RuntimeError(
+                        f"Prefill backend error {resp.status}: {error_text}"
+                    )
+                await resp.read()
+                logger.info(
+                    "[prefill] done request_id=%s status=%s elapsed=%.2fs",
+                    request_id,
+                    resp.status,
+                    time.perf_counter() - start_ts,
+                )
+        except asyncio.TimeoutError as exc:
+            raise RuntimeError(f"Prefill service timeout at {url}") from exc
+        except aiohttp.ClientError as exc:
+            raise RuntimeError(f"Prefill service unavailable at {url}") from exc
+
+    async def _stream_decode(
+        request_path: str,
+        payload: dict,
+        headers: dict[str, str],
+        request_id: str,
+    ):
+        url = f"{DECODE_BASE}{request_path}"
+        # Stream tokens from the decode service once the prefill stage has
+        # materialized KV caches on the target workers.
+        logger.info("[decode] start request_id=%s url=%s", request_id, url)
+        try:
+            async with (
+                aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+                session.post(url=url, json=payload, headers=headers) as resp,
+            ):
+                if resp.status != 200:
+                    error_text = await resp.text()
+                    logger.error(
+                        "Decode backend error %s - %s", resp.status, error_text
+                    )
+                    err_msg = (
+                        '{"error": "Decode backend error ' + str(resp.status) + '"}'
+                    )
+                    yield err_msg.encode()
+                    return
+                logger.info(
+                    "[decode] streaming response request_id=%s status=%s",
+                    request_id,
+                    resp.status,
+                )
+                async for chunk_bytes in resp.content.iter_chunked(1024):
+                    yield chunk_bytes
+                logger.info("[decode] finished streaming request_id=%s", request_id)
+        except asyncio.TimeoutError:
+            logger.error("Decode service timeout at %s", url)
+            yield b'{"error": "Decode service timeout"}'
+        except aiohttp.ClientError as exc:
+            logger.error("Decode service error at %s: %s", url, exc)
+            yield b'{"error": "Decode service unavailable"}'

    async def process_request():
        """Process a single request through prefill and decode stages"""
@@ -146,13 +206,27 @@ def main():
            # Create prefill request (max_tokens=1)
            prefill_request = original_request_data.copy()
            prefill_request["max_tokens"] = 1
+            if "max_completion_tokens" in prefill_request:
+                prefill_request["max_completion_tokens"] = 1

            # Execute prefill stage
-            async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
-                continue
+            # The request id encodes both KV socket addresses so the backend can
+            # shuttle tensors directly via NCCL once the prefill response
+            # completes.
+            request_id = (
+                f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
+                f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
+            )
+
+            headers = _build_headers(request_id)
+            await _run_prefill(request.path, prefill_request, headers, request_id)

            # Execute decode stage and stream response
-            generator = forward_request(DECODE_SERVICE_URL, original_request_data)
+            # Pass the unmodified user request so the decode phase can continue
+            # sampling with the already-populated KV cache.
+            generator = _stream_decode(
+                request.path, original_request_data, headers, request_id
+            )
            response = await make_response(generator)
            response.timeout = None  # Disable timeout for streaming response
            return response
@@ -168,23 +242,10 @@ def main():
    @app.route("/v1/completions", methods=["POST"])
    async def handle_request():
        """Handle incoming API requests with concurrency and rate limiting"""
-        # Create task for request processing
-        task = asyncio.create_task(process_request())
-
-        # Enqueue request or reject if queue is full
-        if not await request_queue.enqueue(task):
-            return Response(
-                response=b'{"error": "Server busy, try again later"}',
-                status=503,
-                content_type="application/json",
-            )
-
        try:
-            # Return the response from the processing task
-            return await task
+            return await process_request()
        except asyncio.CancelledError:
-            # Handle task cancellation (timeout or queue full)
-            logger.warning("Request cancelled due to timeout or queue full")
+            logger.warning("Request cancelled")
            return Response(
                response=b'{"error": "Request cancelled"}',
                status=503,
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -1,10 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import os
+
+# Disable DeepGEMM for this benchmark to use CUTLASS
+os.environ["VLLM_USE_DEEP_GEMM"] = "0"
+
 import torch

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_w8a8_block_fp8_linear,
+    W8A8BlockFp8LinearOp,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    CUTLASS_BLOCK_FP8_SUPPORTED,
@@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_max, fp8_min = fp8_info.max, fp8_info.min

-    # Create random FP8 tensors
+    # Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp)
    A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max

+    # Create quantized weight tensor
    B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
    B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

-    # Create scales
+    # Create weight scales
    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
@@ -55,19 +64,25 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
        * factor_for_scale
    )

-    # SM90 CUTLASS requires row-major format for scales
-    if use_cutlass and current_platform.is_device_capability(90):
-        Bs = Bs.T.contiguous()
+    # Create W8A8BlockFp8LinearOp instance
+    weight_group_shape = GroupShape(block_n, block_k)
+    act_quant_group_shape = GroupShape(1, block_k)  # Per-token, per-group quantization
+
+    linear_op = W8A8BlockFp8LinearOp(
+        weight_group_shape=weight_group_shape,
+        act_quant_group_shape=act_quant_group_shape,
+        cutlass_block_fp8_supported=use_cutlass,
+        use_aiter_and_is_supported=False,
+    )

    def run():
-        if use_cutlass:
-            return apply_w8a8_block_fp8_linear(
-                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
-            )
-        else:
-            return apply_w8a8_block_fp8_linear(
-                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
-            )
+        return linear_op.apply(
+            input=A_ref,
+            weight=B,
+            weight_scale=Bs,
+            input_scale=None,
+            bias=None,
+        )

    return run

--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -255,8 +255,8 @@ def bench_run(
        torch.cuda.synchronize()

        # Timing
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
+        start_event = torch.Event(enable_timing=True)
+        end_event = torch.Event(enable_timing=True)

        latencies = []
        for _ in range(num_iters):
--- a/benchmarks/kernels/benchmark_fused_collective.py
+++ b/benchmarks/kernels/benchmark_fused_collective.py
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
            b_q_weight=w_q,
            b_bias=None,
            b_scales=w_s,
+            a_scales=None,
            global_scale=None,
            b_zeros=w_zp,
            g_idx=g_idx,
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -263,7 +263,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
@@ -273,7 +273,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -185,8 +185,8 @@ def benchmark_config(
        graph.replay()
    torch.cuda.synchronize()

-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

    latencies: list[float] = []
    for i in range(num_iters):
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -105,8 +105,8 @@ def benchmark_permute(
        graph.replay()
    torch.cuda.synchronize()

-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

    latencies: list[float] = []
    for i in range(num_iters):
@@ -241,8 +241,8 @@ def benchmark_unpermute(
        graph.replay()
    torch.cuda.synchronize()

-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

    latencies: list[float] = []
    for i in range(num_iters):
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -6,7 +6,7 @@
 #
 # The CSV file (named with current date/time) contains these columns:
 # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
-# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
+# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
 # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
 # speedup
 #
@@ -86,9 +86,8 @@ def benchmark_mrope(
    num_heads: int,
    num_kv_heads: int,
    max_position: int = 8192,
-    rope_theta: float = 10000,
    is_neox_style: bool = True,
-    rope_scaling: dict[str, Any] = None,
+    rope_parameters: dict[str, Any] | None = None,
    dtype: torch.dtype = torch.bfloat16,
    seed: int = 0,
    warmup_iter: int = 10,
@@ -102,9 +101,8 @@ def benchmark_mrope(
        head_size=head_dim,
        rotary_dim=head_dim,
        max_position=max_position,
-        base=rope_theta,
        is_neox_style=is_neox_style,
-        rope_scaling=rope_scaling,
+        rope_parameters=rope_parameters,
        dtype=dtype,
    ).to(device=device)

@@ -203,9 +201,8 @@ def benchmark_mrope(
            num_kv_heads,
            head_dim,
            max_position,
-            rope_theta,
            is_neox_style,
-            str(rope_scaling),
+            str(rope_parameters),
            str(dtype).split(".")[-1],
            torch_stats["mean"],
            torch_stats["median"],
@@ -255,9 +252,8 @@ if __name__ == "__main__":
            "num_kv_heads",
            "head_dim",
            "max_position",
-            "rope_theta",
            "is_neox_style",
-            "rope_scaling",
+            "rope_parameters",
            "dtype",
            "torch_mean",
            "torch_median",
@@ -303,7 +299,7 @@ if __name__ == "__main__":
                q_size = num_heads * head_dim
                kv_size = num_kv_heads * head_dim
                is_neox_style = True
-                rope_theta = config.rope_theta
+                rope_parameters = config.rope_parameters
                max_position = config.max_position_embeddings

                for num_tokens in num_tokens_list:
@@ -315,9 +311,8 @@ if __name__ == "__main__":
                        num_heads=num_heads,
                        num_kv_heads=num_kv_heads,
                        max_position=max_position,
-                        rope_theta=rope_theta,
                        is_neox_style=is_neox_style,
-                        rope_scaling=config.rope_scaling,
+                        rope_parameters=rope_parameters,
                        dtype=getattr(torch, args.dtype),
                        seed=args.seed,
                        warmup_iter=args.warmup_iter,
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -30,8 +30,8 @@ def _time_cuda(
        fn()
    torch.cuda.synchronize()

-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
+    start = torch.Event(enable_timing=True)
+    end = torch.Event(enable_timing=True)

    start.record()
    for _ in range(bench_iters):
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,97 +1,76 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from itertools import accumulate
+import itertools

-import nvtx
 import torch

-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser

+batch_size_range = [2**i for i in range(0, 8, 2)]
+seq_len_range = [2**i for i in range(6, 10, 1)]
+num_heads_range = [32, 48]
+configs = list(itertools.product(batch_size_range, seq_len_range, num_heads_range))

-def benchmark_rope_kernels_multi_lora(
-    is_neox_style: bool,
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: int | None,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    max_position: int = 8192,
-    base: float = 10000,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    # silulating serving 4 LoRAs
-    scaling_factors = [1, 2, 4, 8]
-    # batched RoPE can take multiple scaling factors
-    batched_rope = get_rope(
-        head_size,
-        rotary_dim,
-        max_position,
-        base,
-        is_neox_style,
-        {"rope_type": "linear", "factor": tuple(scaling_factors)},
-    )
-    # non-batched RoPE takes only one scaling factor, we create multiple
-    # instances to simulate the same behavior
-    non_batched_ropes: list[RotaryEmbedding] = []
-    for scaling_factor in scaling_factors:
-        non_batched_ropes.append(
-            get_rope(
-                head_size,
-                rotary_dim,
-                max_position,
-                base,
-                is_neox_style,
-                {"rope_type": "linear", "factor": (scaling_factor,)},
-            )
-        )

-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype)
-    key = torch.randn_like(query)
-
-    # create query offsets for batched RoPE, we concat multiple kv cache
-    # together and each query needs to find the right kv cache of its type
-    offset_map = torch.tensor(
-        list(
-            accumulate(
-                [0]
-                + [
-                    max_position * scaling_factor * 2
-                    for scaling_factor in scaling_factors[:-1]
-                ]
-            )
+def get_benchmark(head_size, rotary_dim, is_neox_style, device):
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "seq_len", "num_heads"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["torch", "flashinfer", "vllm"],
+            line_names=["PyTorch", "FlashInfer", "vLLM"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=f"rope-perf{'-neox-style' if is_neox_style else ''}",
+            args={},
        )
    )
-    query_types = torch.randint(
-        0, len(scaling_factors), (batch_size, seq_len), device=device
-    )
-    # map query types to offsets
-    query_offsets = offset_map[query_types]
-    # the kernel takes flattened offsets
-    flatten_offsets = query_offsets.flatten()
+    def benchmark(batch_size, seq_len, num_heads, provider):
+        dtype = torch.bfloat16
+        max_position = 8192
+        base = 10000
+        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+        rope = rope.to(dtype=dtype, device=device)
+        cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)

-    # batched queries of the same type together for non-batched RoPE
-    queries = [query[query_types == i] for i in range(len(scaling_factors))]
-    keys = [key[query_types == i] for i in range(len(scaling_factors))]
-    packed_qkr = zip(queries, keys, non_batched_ropes)
-    # synchronize before start timing
-    torch.cuda.synchronize()
-    with nvtx.annotate("non-batched", color="yellow"):
-        for q, k, r in packed_qkr:
-            r.forward(positions, q, k)
-    torch.cuda.synchronize()
-    with nvtx.annotate("batched", color="green"):
-        batched_rope.forward(positions, query, key, flatten_offsets)
-    torch.cuda.synchronize()
+        positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
+        query = torch.randn(
+            (batch_size, seq_len, num_heads * head_size), dtype=dtype, device=device
+        )
+        key = torch.randn_like(query)
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "torch":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rope.forward_native(positions, query.clone(), key.clone()),
+                quantiles=quantiles,
+            )
+        elif provider == "flashinfer":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: torch.ops.vllm.flashinfer_rotary_embedding(
+                    positions,
+                    query.clone(),
+                    key.clone(),
+                    head_size,
+                    cos_sin_cache,
+                    is_neox_style,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rope.forward_cuda(positions, query.clone(), key.clone()),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark


 if __name__ == "__main__":
@@ -116,17 +95,12 @@ if __name__ == "__main__":
    parser.add_argument(
        "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0"
    )
+    parser.add_argument("--save-path", type=str, default="./configs/rope/")
    args = parser.parse_args()
-    print(args)

-    benchmark_rope_kernels_multi_lora(
-        is_neox_style=args.is_neox_style,
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        num_heads=args.num_heads,
-        head_size=args.head_size,
-        rotary_dim=args.rotary_dim,
-        dtype=getattr(torch, args.dtype),
-        seed=args.seed,
-        device=args.device,
+    # Get the benchmark function
+    benchmark = get_benchmark(
+        args.head_size, args.rotary_dim, args.is_neox_style, args.device
    )
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -253,8 +253,8 @@ def benchmark(
        )
    torch.cuda.synchronize()

-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

    # Benchmark
    latencies: list[float] = []
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -127,8 +127,8 @@ def benchmark_decode(

    def time_fn(fn, warmup=10, trials=20):
        torch.cuda.synchronize()
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
+        start = torch.Event(enable_timing=True)
+        end = torch.Event(enable_timing=True)
        times = []
        for i in range(warmup):
            fn()
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -139,8 +139,8 @@ def benchmark_prefill(

    def time_fn(fn, warmup=10, trials=20):
        torch.cuda.synchronize()
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
+        start = torch.Event(enable_timing=True)
+        end = torch.Event(enable_timing=True)
        times = []
        for i in range(warmup):
            fn()
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -183,8 +183,8 @@ def benchmark_config(
        run()
    torch.cuda.synchronize()

-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

    latencies: list[float] = []
    for i in range(num_iters):
--- a/benchmarks/kernels/deepgemm/README.md
+++ b/benchmarks/kernels/deepgemm/README.md
@@ -2,7 +2,7 @@

 This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.

-Currently this just includes dense GEMMs and only works on Hopper GPUs.
+Currently, this just includes dense GEMMs and only works on Hopper GPUs.

 ## Setup

--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -55,6 +55,10 @@ output_num_chunks  166.0    99.01   11.80    79.00    90.00    98.00   108.75
 ----------------------------------------------------------------------------------------------------
 ```

+If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
+and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
+benchmark-only runtime so the reported throughput stays comparable).
+
 ### JSON configuration file for synthetic conversations generation

 The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@@ -11,6 +11,7 @@ from bench_utils import (
    Color,
    logger,
 )
+from tqdm import tqdm
 from transformers import AutoTokenizer  # type: ignore

 # Conversation ID is a string (e.g: "UzTK34D")
@@ -417,6 +418,10 @@ def generate_conversations(
            data = file.read()
            tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
            list_of_tokens.extend(tokens_in_file)
+        logger.info(
+            f"Loaded {len(tokens_in_file)} tokens from file {filename}, "
+            f"total tokens so far: {len(list_of_tokens)}"
+        )

    conversations: ConversationsMap = {}
    conv_id = 0
@@ -449,18 +454,25 @@ def generate_conversations(
        )
        base_offset += common_prefix_tokens

-    for conv_id in range(args.num_conversations):
+    for conv_id in tqdm(
+        range(args.num_conversations),
+        total=args.num_conversations,
+        desc="Generating conversations",
+        unit="conv",
+    ):
        # Generate a single conversation
        messages: MessagesList = []

        nturns = turn_count[conv_id]

        # User prompt token count per turn (with lower limit)
-        input_token_count: np.ndarray = args.input_num_tokens.sample(nturns)
+        input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int)
        input_token_count = np.maximum(input_token_count, base_prompt_token_count)

        # Assistant answer token count per turn (with lower limit)
-        output_token_count: np.ndarray = args.output_num_tokens.sample(nturns)
+        output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype(
+            int
+        )
        output_token_count = np.maximum(output_token_count, 1)

        user_turn = True
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -55,6 +55,7 @@ class ClientArgs(NamedTuple):
    verify_output: bool
    conversation_sampling: ConversationSampling
    request_rate: float
+    max_retries: int


 class RequestArgs(NamedTuple):
@@ -63,6 +64,7 @@ class RequestArgs(NamedTuple):
    stream: bool
    limit_min_tokens: int  # Use negative value for no limit
    limit_max_tokens: int  # Use negative value for no limit
+    timeout_sec: int


 class BenchmarkArgs(NamedTuple):
@@ -214,6 +216,7 @@ async def send_request(
    stream: bool = True,
    min_tokens: int | None = None,
    max_tokens: int | None = None,
+    timeout_sec: int = 120,
 ) -> ServerResponse:
    payload = {
        "model": model,
@@ -235,10 +238,16 @@ async def send_request(
    headers = {"Content-Type": "application/json"}

    # Calculate the timeout for the request
-    timeout_sec = 120
    if max_tokens is not None:
        # Assume TPOT of 200ms and use max_tokens to determine timeout
-        timeout_sec = max(timeout_sec, int(max_tokens * 0.2))
+        token_based_timeout = int(max_tokens * 0.2)
+        if token_based_timeout > timeout_sec:
+            timeout_sec = token_based_timeout
+            logger.info(
+                "Using timeout of %ds based on max_tokens %d",
+                timeout_sec,
+                max_tokens,
+            )
    timeout = aiohttp.ClientTimeout(total=timeout_sec)

    valid_response = True
@@ -409,6 +418,7 @@ async def send_turn(
        req_args.stream,
        min_tokens,
        max_tokens,
+        req_args.timeout_sec,
    )

    if response.valid is False:
@@ -518,6 +528,25 @@ async def poisson_sleep(request_rate: float, verbose: bool = False) -> None:
    await asyncio.sleep(interval)


+async def exponential_backoff_sleep(
+    attempt_cnt: int,
+    base_rate: float = 1.0,
+    backoff_factor: float = 2.0,
+    jitter_fraction: float = 0.10,
+    verbose: bool = False,
+) -> None:
+    # Sleep with exponential backoff and jitter after a failed request.
+    backoff_delay = base_rate * (backoff_factor**attempt_cnt)
+    jittered_delay = backoff_delay * (
+        1 + np.random.uniform(-jitter_fraction, jitter_fraction)
+    )
+
+    if verbose:
+        logger.info(f"Backoff for {jittered_delay:.3f} seconds...")
+
+    await asyncio.sleep(jittered_delay)
+
+
 async def client_main(
    args: ClientArgs,
    req_args: RequestArgs,
@@ -532,8 +561,11 @@ async def client_main(
        f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}"  # noqa: E501
    )

-    random.seed(args.seed)
-    np.random.seed(args.seed)
+    # Set unique seed per client (each client runs in its own process)
+    # Add 1 to ensure no client uses the same seed as the main process
+    client_seed = args.seed + client_id + 1
+    random.seed(client_seed)
+    np.random.seed(client_seed)

    # Active conversations
    active_convs: ConversationsMap = {}
@@ -646,49 +678,62 @@ async def client_main(
                )
                time_of_last_turn[conv_id] = curr_time_sec

-            success = True
-            try:
-                result = await send_turn(
-                    session,
-                    client_id,
-                    conv_id,
-                    messages,
-                    current_turn,
-                    tokenizer,
-                    req_args,
-                    args.print_content,
-                    args.verify_output,
-                )
-                if result is not None:
-                    result_queue.put(result)
-                else:
-                    # None means that the request failed,
-                    # and should not be added to the statistics.
-                    success = False
-                    num_failures += 1
-
-                    logger.warning(
-                        f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
+            success = False
+            for attempt_cnt in range(args.max_retries + 1):
+                try:
+                    exception = False
+                    result = await send_turn(
+                        session,
+                        client_id,
+                        conv_id,
+                        messages,
+                        current_turn,
+                        tokenizer,
+                        req_args,
+                        args.print_content,
+                        args.verify_output,
+                    )
+                    if result is not None:
+                        result_queue.put(result)
+                        success = True
+                        break
+                    else:
+                        logger.warning(
+                            f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
+                        )
+                except asyncio.exceptions.TimeoutError:
+                    exception = True
+                    logger.error(
+                        "%sClient %d - Timeout during conversation ID %s (turn: %d). "
+                        "Base timeout is %ss (set with --request-timeout-sec), but the "
+                        "effective timeout may be longer based on max_tokens. If this "
+                        "is unexpected, consider increasing the timeout or checking "
+                        "model performance.%s",
+                        Color.RED,
+                        client_id,
+                        conv_id,
+                        current_turn,
+                        req_args.timeout_sec,
+                        Color.RESET,
+                    )
+                except Exception:
+                    exception = True
+                    logger.exception(
+                        f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
                    )

-                    # Remove the conversation (should not be used again)
-                    active_convs.pop(conv_id)
+                # Sleep before retry if not last attempt
+                if not success and attempt_cnt < args.max_retries:
+                    await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose)

-            except asyncio.exceptions.TimeoutError:
+            if not success:
                num_failures += 1
-                logger.exception(
-                    f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
-                )
-                break  # Exit gracefully instead of raising an error
+                # Remove the conversation (should not be used again)
+                active_convs.pop(conv_id)
+                if exception:
+                    break  # Exit gracefully instead of raising an error

-            except Exception:
-                num_failures += 1
-                logger.exception(
-                    f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
-                )
-                break  # Exit gracefully instead of raising an error
-
-            if success:
+            else:
                num_successes += 1

                # Update the turns counter to include the LLM response
@@ -803,6 +848,7 @@ def get_client_config(
        verify_output=args.verify_output,
        conversation_sampling=args.conversation_sampling,
        request_rate=args.request_rate,
+        max_retries=args.max_retries,
    )

    if args.limit_min_tokens > 0 or args.limit_max_tokens > 0:
@@ -815,6 +861,9 @@ def get_client_config(
                "Invalid min/max tokens limits (min should not be larger than max)"
            )

+    if args.request_timeout_sec <= 0:
+        raise ValueError("Request timeout must be a positive number")
+
    # Arguments for API requests
    chat_url = f"{args.url}/v1/chat/completions"
    model_name = args.served_model_name if args.served_model_name else args.model
@@ -825,6 +874,7 @@ def get_client_config(
        stream=not args.no_stream,
        limit_min_tokens=args.limit_min_tokens,
        limit_max_tokens=args.limit_max_tokens,
+        timeout_sec=args.request_timeout_sec,
    )

    return client_args, req_args
@@ -968,7 +1018,7 @@ async def main_mp(
            f"(is alive: {client.is_alive()}){Color.RESET}"
        )

-        client.join(timeout=120)
+        client.join(timeout=req_args.timeout_sec + 1)

        if client.is_alive():
            logger.warning(
@@ -1026,6 +1076,7 @@ def process_statistics(
    verbose: bool,
    gen_conv_args: GenConvArgs | None = None,
    excel_output: bool = False,
+    warmup_runtime_sec: float | None = None,
 ) -> None:
    if len(client_metrics) == 0:
        logger.info("No samples to process")
@@ -1119,8 +1170,13 @@ def process_statistics(
        # Convert milliseconds to seconds
        runtime_sec = runtime_sec / 1000.0
        requests_per_sec = float(len(df)) / runtime_sec
-
-        params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
+        params = {
+            "runtime_sec": runtime_sec,
+            "requests_per_sec": requests_per_sec,
+        }
+        if warmup_runtime_sec is not None:
+            params["warmup_runtime_sec"] = warmup_runtime_sec
+            params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec

        # Generate a summary of relevant metrics (and drop irrelevant data)
        df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@@ -1334,6 +1390,16 @@ async def main() -> None:
        help="Expected request rate (Poisson process) per client in requests/sec."
        "Set to 0 for no delay between requests.",
    )
+    parser.add_argument(
+        "--max-retries",
+        type=int,
+        default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")),
+        help="Maximum number of retry attempts for timed-out requests. "
+        "Default is 0 (no retries). "
+        "Set to higher values to retry failed requests and maintain "
+        "fair workload distribution. "
+        "Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.",
+    )
    parser.add_argument(
        "--conversation-sampling",
        type=ConversationSampling,
@@ -1351,6 +1417,13 @@ async def main() -> None:
        action="store_true",
        help="Verify the LLM output (compare to the answers in the input JSON file)",
    )
+    parser.add_argument(
+        "--request-timeout-sec",
+        type=int,
+        default=120,
+        help="Timeout in seconds for each API request (default: 120). "
+        "Automatically increased if max tokens imply longer decoding.",
+    )

    parser.add_argument(
        "--no-stream",
@@ -1426,6 +1499,7 @@ async def main() -> None:
            f"Invalid --warmup-percentage={args.warmup_percentage}"
        ) from None

+    # Set global seeds for main process
    random.seed(args.seed)
    np.random.seed(args.seed)

@@ -1484,6 +1558,8 @@ async def main() -> None:
        url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
    )

+    warmup_runtime_sec: float | None = None
+
    # Warm-up step
    if args.warmup_step:
        # Only send a single user prompt from every conversation.
@@ -1498,26 +1574,56 @@ async def main() -> None:
        # all clients should finish their work before exiting
        warmup_bench_args = bench_args._replace(early_stop=False)

-        logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
+        logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
+        warmup_start_ns = time.perf_counter_ns()
        conversations, _ = await main_mp(
            warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
        )
-        logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
+        warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
+        logger.info(
+            "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+            Color.PURPLE,
+            warmup_runtime_sec,
+            warmup_runtime_sec * 1000,
+            Color.RESET,
+        )
+        logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)

    # Run the benchmark
-    start_time = time.perf_counter_ns()
+    benchmark_start_ns = time.perf_counter_ns()
    client_convs, client_metrics = await main_mp(
        client_args, req_args, bench_args, tokenizer, conversations
    )
-    total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
+    benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)

    # Calculate requests per second
-    total_runtime_sec = total_runtime_ms / 1000.0
-    rps = len(client_metrics) / total_runtime_sec
+    requests_per_sec = len(client_metrics) / benchmark_runtime_sec
+    benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
    logger.info(
-        f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
-        f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
+        "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
+        "requests per second: %.3f%s",
+        Color.GREEN,
+        benchmark_runtime_sec,
+        benchmark_runtime_ms,
+        requests_per_sec,
+        Color.RESET,
    )
+    if warmup_runtime_sec is not None:
+        total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
+        logger.info(
+            "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+            Color.GREEN,
+            warmup_runtime_sec,
+            warmup_runtime_sec * 1000,
+            Color.RESET,
+        )
+        logger.info(
+            "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
+            Color.GREEN,
+            total_runtime_sec,
+            total_runtime_sec * 1000,
+            Color.RESET,
+        )

    # Benchmark parameters
    params = {
@@ -1542,6 +1648,7 @@ async def main() -> None:
        verbose=args.verbose,
        gen_conv_args=gen_conv_args,
        excel_output=args.excel_output,
+        warmup_runtime_sec=warmup_runtime_sec,
    )

    if args.output_file is not None:
--- a/benchmarks/multi_turn/requirements.txt
+++ b/benchmarks/multi_turn/requirements.txt
@@ -2,4 +2,5 @@ numpy>=1.24
 pandas>=2.0.0
 aiohttp>=3.10
 transformers>=4.46
-xlsxwriter>=3.2.1
+xlsxwriter>=3.2.1
+tqdm>=4.66
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -15,6 +15,7 @@ endif()
 #
 set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
 set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
+set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})

 include_directories("${CMAKE_SOURCE_DIR}/csrc")

@@ -140,6 +141,22 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
        set(ENABLE_AVX512VNNI OFF)
        message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
    endif()
+
+    find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND)
+    if (AMXBF16_FOUND OR ENABLE_AMXBF16)
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+            list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile")
+            set(ENABLE_AMXBF16 ON)
+            add_compile_definitions(-DCPU_CAPABILITY_AMXBF16)
+        else()
+            set(ENABLE_AMXBF16 OFF)
+            message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3")
+        endif()
+    else()
+        set(ENABLE_AMXBF16 OFF)
+        message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.")
+    endif()
    
 elseif (AVX2_FOUND)
    list(APPEND CXX_COMPILE_FLAGS "-mavx2")
@@ -193,7 +210,30 @@ endif()
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
    # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
    # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+    set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
    if(ASIMD_FOUND)
+        # Set number of parallel build processes
+        include(ProcessorCount)
+        ProcessorCount(NPROC)
+        if(NOT NPROC)
+            set(NPROC 4)
+        endif()
+        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+        # and create a local shim dir with it
+        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+        find_library(OPEN_MP
+            NAMES gomp
+            PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
+            NO_DEFAULT_PATH
+            REQUIRED
+        )
+        # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
+        if (OPEN_MP)
+            set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+        endif()
+
+        # Fetch and populate ACL
        if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
            message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
        else()
@@ -202,43 +242,58 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
                SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
                SOURCE_DIR   "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
                GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
-                GIT_TAG        v52.2.0
+                GIT_TAG        v52.6.0
                GIT_SHALLOW    TRUE
                GIT_PROGRESS   TRUE
            )
            set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
+            set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build")
        endif()

-        # Build ACL with scons
-        include(ProcessorCount)
-        ProcessorCount(_NPROC)
-        set(_scons_cmd
-        scons -j${_NPROC}
-            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
-            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
-            multi_isa=1 openmp=1 cppthreads=0
+        # Build ACL with CMake
+        set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
+        set(CMAKE_BUILD_TYPE "Release")
+        set(ARM_COMPUTE_ARCH "armv8.2-a")
+        set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
+        set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
+        set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+        set(ARM_COMPUTE_ENABLE_OPENMP "ON")
+        set(ARM_COMPUTE_ENABLE_WERROR "OFF")
+        set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
+        set(ARM_COMPUTE_BUILD_TESTING "OFF")
+
+        set(_cmake_config_cmd
+             ${CMAKE_COMMAND} -G Ninja -B build 
+            -DARM_COMPUTE_BUILD_SHARED_LIB=OFF 
+            -DCMAKE_BUILD_TYPE=Release 
+            -DARM_COMPUTE_ARCH=armv8.2-a 
+            -DARM_COMPUTE_ENABLE_ASSERTS=OFF 
+            -DARM_COMPUTE_ENABLE_CPPTHREADS=OFF 
+            -DARM_COMPUTE_ENABLE_OPENMP=ON 
+            -DARM_COMPUTE_ENABLE_WERROR=OFF 
+            -DARM_COMPUTE_BUILD_EXAMPLES=OFF 
+            -DARM_COMPUTE_BUILD_TESTING=OFF)
+        set(_cmake_build_cmd
+            ${CMAKE_COMMAND} --build build -- -j${NPROC}
        )

-        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
-        # and create a local shim dir with it
-        include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
-        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
-
-        if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
-            list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
-        endif()
-
        execute_process(
-            COMMAND ${_scons_cmd}
+            COMMAND ${_cmake_config_cmd}
+            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+        )
+        execute_process(
+            COMMAND ${_cmake_build_cmd}
            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
            RESULT_VARIABLE _acl_rc
        )
+
        if(NOT _acl_rc EQUAL 0)
            message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
        endif()
+        message(STATUS "Arm Compute Library (ACL) built successfully.")

-        set(ONEDNN_AARCH64_USE_ACL "ON")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+        # VLLM/oneDNN settings for ACL
+        set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE)
        add_compile_definitions(VLLM_USE_ACL)
    endif()

@@ -255,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        FetchContent_Declare(
            oneDNN
            GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-            GIT_TAG v3.9
+            GIT_TAG v3.10
            GIT_PROGRESS TRUE
            GIT_SHALLOW TRUE
        )
@@ -275,7 +330,10 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
    set(ONEDNN_VERBOSE "OFF")
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

+    set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
+    set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
    FetchContent_MakeAvailable(oneDNN)
+    set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE})
    add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
    target_include_directories(
        dnnl_ext
@@ -305,18 +363,19 @@ endif()
 #
 set(VLLM_EXT_SRC
    "csrc/cpu/activation.cpp"
-    "csrc/cpu/attention.cpp"
-    "csrc/cpu/cache.cpp"
    "csrc/cpu/utils.cpp"
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/mla_decode.cpp"
    "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/torch_bindings.cpp"
-    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
+    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
+    "csrc/cpu/cpu_attn.cpp"
+    "csrc/cpu/scratchpad_manager.cpp"
+    "csrc/cpu/torch_bindings.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
        "csrc/cpu/shm.cpp"
+        "csrc/cpu/cpu_wna16.cpp"
        ${VLLM_EXT_SRC})
    if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
        set(VLLM_EXT_SRC
--- a/cmake/external_projects/triton_kernels.cmake
+++ b/cmake/external_projects/triton_kernels.cmake
@@ -0,0 +1,53 @@
+# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
+
+set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
+
+# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
+# be directly set to the triton_kernels python directory. 
+if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
+  message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
+  FetchContent_Declare(
+          triton_kernels
+          SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
+  )
+
+else()
+  set(TRITON_GIT "https://github.com/triton-lang/triton.git")
+  message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
+  FetchContent_Declare(
+          triton_kernels
+          # TODO (varun) : Fetch just the triton_kernels directory from Triton
+          GIT_REPOSITORY https://github.com/triton-lang/triton.git
+          GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
+          GIT_PROGRESS TRUE
+          SOURCE_SUBDIR python/triton_kernels/triton_kernels
+  )
+endif()
+
+# Fetch content 
+FetchContent_MakeAvailable(triton_kernels)
+
+if (NOT triton_kernels_SOURCE_DIR)
+  message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
+endif()
+
+if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
+  set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
+else()
+  set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
+endif()
+
+message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")
+
+add_custom_target(triton_kernels)
+
+# Ensure the vllm/third_party directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")
+
+## Copy .py files to install directory.
+install(DIRECTORY
+        ${TRITON_KERNELS_PYTHON_DIR}
+        DESTINATION 
+        vllm/third_party/triton_kernels/
+        COMPONENT triton_kernels
+        FILES_MATCHING PATTERN "*.py")
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309
+          GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -495,7 +495,13 @@ function (define_extension_target MOD_NAME)
    set(SOABI_KEYWORD "")
  endif()

-  if (ARG_USE_SABI)
+  run_python(IS_FREETHREADED_PYTHON
+    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
+    "Failed to determine whether interpreter is free-threaded")
+
+  # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
+  # so avoid using the stable ABI under free-threading only.
+  if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
    Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
  else()
    Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
    scalar_t* output, float* output_lse, const scalar_t* prefix_output,
    const float* prefix_lse, const scalar_t* suffix_output,
    const float* suffix_lse, const uint num_tokens, const uint num_heads,
-    const uint head_size) {
+    const uint head_size, const uint prefix_head_stride,
+    const uint output_head_stride) {
  using pack_128b_t = uint4;
  const uint pack_size = 16 / sizeof(scalar_t);
  const uint threads_per_head = head_size / pack_size;
@@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
  const uint head_idx = token_head_idx % num_heads;

  const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
-  const uint head_offset =
-      token_idx * num_heads * head_size + head_idx * head_size;
-  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
-  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
-  scalar_t* output_head_ptr = output + head_offset;
+  const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
+                               head_idx * prefix_head_stride;
+  const uint dst_head_offset = token_idx * num_heads * output_head_stride +
+                               head_idx * output_head_stride;
+  const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
+  const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
+  scalar_t* output_head_ptr = output + dst_head_offset;

  float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
  float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
            reinterpret_cast<float*>(prefix_lse.data_ptr()),                \
            reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),          \
            reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,    \
-            num_heads, head_size);                                          \
+            num_heads, head_size, prefix_head_stride, output_head_stride);  \
  }

 /*@brief Merges the attention states from prefix and suffix
@@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
  const uint num_tokens = output.size(0);
  const uint num_heads = output.size(1);
  const uint head_size = output.size(2);
+  const uint prefix_head_stride = prefix_output.stride(1);
+  const uint output_head_stride = output.stride(1);
  const uint pack_size = 16 / sizeof(scalar_t);
  TORCH_CHECK(head_size % pack_size == 0,
              "headsize must be multiple of pack_size:", pack_size);
-  TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
-              "output heads must be contiguous in memory");
-  TORCH_CHECK(
-      prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
-      "prefix_output heads must be contiguous in memory");
-  TORCH_CHECK(
-      suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
-      "suffix_output heads must be contiguous in memory");
  float* output_lse_ptr = nullptr;
  if (output_lse.has_value()) {
    output_lse_ptr = output_lse.value().data_ptr<float>();
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);

 void gather_and_maybe_dequant_cache(
-    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
-    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
-    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
+    torch::Tensor const& src_cache,     // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+    torch::Tensor const& dst,           // [TOT_TOKENS, ENTRIES...]
+    torch::Tensor const& block_table,   // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& cu_seq_lens,   // [BATCH+1]
+    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
+    int64_t num_tokens, const std::string& kv_cache_dtype,
    torch::Tensor const& scale,
    std::optional<torch::Tensor> seq_starts = std::nullopt);

--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -552,7 +552,11 @@ __global__ void indexer_k_quant_and_cache_kernel(
 #ifndef USE_ROCM
  __syncwarp();
 #endif
+#if defined(__gfx942__)
+  float scale = fmaxf(amax, 1e-4) / 224.0f;
+#else
  float scale = fmaxf(amax, 1e-4) / 448.0f;
+#endif
  if (use_ue8m0) {
    scale = exp2f(ceilf(log2f(scale)));
  }
@@ -901,87 +905,80 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
 namespace vllm {

 // grid is launched with dimensions (batch, num_splits)
-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
+          int ENTRY_SIZE, int CTA_SIZE>
 __global__ void gather_and_maybe_dequant_cache(
-    const cache_t* __restrict__ src_cache,    // [NUM_BLOCKS, BLOCK_SIZE,
-                                              // ENTRIES...]
-    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRIES...]
-    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
-    const int32_t* __restrict__ cu_seq_lens,  // [BATCH+1]
-    const int32_t block_size, const int32_t entry_size,
+    const cache_t* __restrict__ src_cache,     // [NUM_BLOCKS, BLOCK_SIZE,
+                                               // ENTRIES...]
+    scalar_t* __restrict__ dst,                // [TOT_TOKENS, ENTRIES...]
+    const int32_t* __restrict__ block_table,   // [BATCH, BLOCK_INDICES]
+    const int32_t* __restrict__ cu_seq_lens,   // [BATCH+1]
+    const int32_t* __restrict__ token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNK]
+    const int32_t num_tokens, const int32_t block_size,
    const int64_t block_table_stride, const int64_t cache_block_stride,
    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
    const float* __restrict__ scale,
    const int32_t* __restrict__ seq_starts) {  // Optional: starting offsets per
                                               // batch
+  constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
+  using ltype = vllm::vec_n_t<cache_t, vec_size>;
+  using stype = vllm::vec_n_t<scalar_t, vec_size>;
+  // We are adding this for code readability which will be optimized out when
+  // build in release.
+  assert(CTA_SIZE == blockDim.x);

-  const int64_t bid = blockIdx.x;  // Batch ID
-  const int32_t num_splits = gridDim.y;
-  const int32_t split = blockIdx.y;
-  const int32_t seq_start = cu_seq_lens[bid];
-  const int32_t seq_end = cu_seq_lens[bid + 1];
-  const int32_t seq_len = seq_end - seq_start;
-  const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
-  const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
+#pragma unroll
+  for (int token_id = blockIdx.x; token_id < num_tokens;
+       token_id += gridDim.x) {
+    int64_t batch_id = token_to_seq[token_id];
+    int64_t batch_start = cu_seq_lens[batch_id];
+    int64_t batch_end = cu_seq_lens[batch_id + 1];
+    int32_t batch_offset = token_id - batch_start;

-  const int32_t split_start = split * split_blocks;
-  const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
+    if (token_id >= batch_end) return;
+    int32_t offset = 0;
+    if (seq_starts != nullptr) {
+      offset = seq_starts[batch_id];
+    }
+    batch_offset += offset;
+    int32_t block_table_id = batch_offset / block_size;
+    int32_t slot_id = batch_offset % block_size;
+    int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
+    int32_t block_id = block_table[block_table_offset];
+    int64_t cache_offset =
+        block_id * cache_block_stride + slot_id * cache_entry_stride;
+    constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
+    scalar_t* dst_ = dst + token_id * dst_entry_stride;
+    cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;

-  const bool is_active_split = (split_start < tot_blocks);
-  const bool is_last_split = (split_end == tot_blocks);
-
-  if (!is_active_split) return;
-
-  int32_t full_blocks_end = split_end;
-  int32_t partial_block_size = 0;
-
-  // Adjust the pointer for the block_table for this batch.
-  // If seq_starts is provided, compute an offset based on (seq_starts[bid] /
-  // page_size)
-  const int32_t batch_offset = bid * block_table_stride;
-  int32_t offset = 0;
-  if (seq_starts != nullptr) {
-    offset = seq_starts[bid] / block_size;
-  }
-  const int32_t* batch_block_table = block_table + batch_offset + offset;
-
-  // Adjust dst pointer based on the cumulative sequence lengths.
-  dst += seq_start * dst_entry_stride;
-
-  if (is_last_split) {
-    partial_block_size = seq_len % block_size;
-    if (partial_block_size) full_blocks_end -= 1;
-  }
-
-  auto copy_entry = [&](const cache_t* __restrict__ _src,
-                        scalar_t* __restrict__ _dst) {
-    for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
+#pragma unroll
+    for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-        _dst[i] = static_cast<scalar_t>(_src[i]);
+        reinterpret_cast<stype*>(dst_)[idx] =
+            static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
      } else {
-        _dst[i] =
-            fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
+        ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
+        stype store_val;
+#pragma unroll
+        for (int j = 0; j < vec_size; ++j) {
+          store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
+              loaded_val.val[j], *scale);
+        }
+        reinterpret_cast<stype*>(dst_)[idx] = store_val;
      }
    }
-  };
-
-  for (int pid = split_start; pid < full_blocks_end; ++pid) {
-    auto block_id = batch_block_table[pid];
-    auto block_start_ptr = src_cache + block_id * cache_block_stride;
-    auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
-    for (int eid = 0; eid < block_size; ++eid) {
-      copy_entry(block_start_ptr + eid * cache_entry_stride,
-                 block_dst_ptr + eid * dst_entry_stride);
-    }
-  }
-
-  if (partial_block_size) {
-    auto block_id = batch_block_table[full_blocks_end];
-    auto block_start_ptr = src_cache + block_id * cache_block_stride;
-    auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
-    for (int eid = 0; eid < partial_block_size; ++eid) {
-      copy_entry(block_start_ptr + eid * cache_entry_stride,
-                 block_dst_ptr + eid * dst_entry_stride);
+    // process tail
+    constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
+    dst_ = dst_ + ENTRY_SIZE - tail_cnt;
+    src_ = src_ + ENTRY_SIZE - tail_cnt;
+#pragma unroll
+    for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
+      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+        dst_[idx] = static_cast<scalar_t>(src_[idx]);
+      } else {
+        dst_[idx] =
+            fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
+      }
    }
  }
 }
@@ -992,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache(
 // SCALAR_T is the data type of the destination tensor.
 // CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
-#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                      \
-  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE>         \
-      <<<grid, block, 0, stream>>>(                                         \
-          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                 \
-          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                      \
-          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
-          block_size, entry_size, block_table_stride, cache_block_stride,   \
-          cache_entry_stride, dst_entry_stride,                             \
-          reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
+#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                        \
+  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576,      \
+                                       thread_block_size>                     \
+      <<<grid, block, 0, stream>>>(                                           \
+          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                   \
+          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                        \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),   \
+          token_to_seq.data_ptr<int32_t>(), num_tokens, block_size,           \
+          block_table_stride, cache_block_stride, cache_entry_stride,         \
+          dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
+          seq_starts_ptr);

 // Gather sequences from the cache into the destination tensor.
 //  - cu_seq_lens contains the cumulative sequence lengths for each batch
 //  - block_table contains the cache block indices for each sequence
+//  - token_to_seq contains the back mapping from token_id to batch_id
 //  - Optionally, seq_starts (if provided) offsets the starting block index by
 //  (seq_starts[bid] / page_size)
 void gather_and_maybe_dequant_cache(
-    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
-    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
-    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
+    torch::Tensor const& src_cache,     // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+    torch::Tensor const& dst,           // [TOT_TOKENS, ENTRIES...]
+    torch::Tensor const& block_table,   // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& cu_seq_lens,   // [BATCH+1]
+    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
+    int64_t num_tokens, const std::string& kv_cache_dtype,
    torch::Tensor const& scale,
    std::optional<torch::Tensor> seq_starts = std::nullopt) {
  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int32_t block_size = src_cache.size(1);
-  int32_t entry_size = src_cache.flatten(2, -1).size(2);
+  int32_t head_dim = dst.size(-1);

  TORCH_CHECK(block_table.dtype() == torch::kInt32,
              "block_table must be int32");
@@ -1029,6 +1030,9 @@ void gather_and_maybe_dequant_cache(
    TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
                "seq_starts must be int32");
  }
+  TORCH_CHECK(head_dim == 576,
+              "gather_and_maybe_dequant_cache only support the head_dim to 576 "
+              "for better performance")

  TORCH_CHECK(src_cache.device() == dst.device(),
              "src_cache and dst must be on the same device");
@@ -1046,10 +1050,9 @@ void gather_and_maybe_dequant_cache(
  int64_t cache_entry_stride = src_cache.stride(1);
  int64_t dst_entry_stride = dst.stride(0);

-  // Decide on the number of splits based on the batch size.
-  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
-  dim3 grid(batch_size, num_splits);
-  dim3 block(1024);
+  constexpr int32_t thread_block_size = 64;
+  dim3 grid(num_tokens);
+  dim3 block(thread_block_size);

  const int32_t* seq_starts_ptr =
      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -1,798 +0,0 @@
-#include "cpu_types.hpp"
-
-namespace {
-
-template <typename scalar_t>
-struct KernelVecType {
-  using q_load_vec_type = void;
-  using q_vec_type = void;
-  using k_load_vec_type = void;
-  using k_vec_type = void;
-  using qk_acc_vec_type = void;
-  using v_load_vec_type = void;
-};
-
-template <>
-struct KernelVecType<float> {
-  using q_load_vec_type = vec_op::FP32Vec4;
-  using q_vec_type = vec_op::FP32Vec16;
-  using k_load_vec_type = vec_op::FP32Vec16;
-  using k_vec_type = vec_op::FP32Vec16;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::FP32Vec16;
-};
-
-template <>
-struct KernelVecType<c10::Half> {
-#if defined(__powerpc64__) || defined(__s390x__)
-  // Power and s390x architecture-specific vector types
-  using q_load_vec_type = vec_op::FP32Vec8;
-  using k_load_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::FP32Vec16;
-#else
-  // Fallback for other architectures, including x86
-  using q_load_vec_type = vec_op::FP16Vec8;
-  using k_load_vec_type = vec_op::FP16Vec16;
-  using v_load_vec_type = vec_op::FP16Vec16;
-#endif
-  using q_vec_type = vec_op::FP32Vec16;
-  using k_vec_type = vec_op::FP32Vec16;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-};
-
-#ifdef __AVX512BF16__
-template <>
-struct KernelVecType<c10::BFloat16> {
-  using q_load_vec_type = vec_op::BF16Vec8;
-  using q_vec_type = vec_op::BF16Vec32;
-  using k_load_vec_type = vec_op::BF16Vec32;
-  using k_vec_type = vec_op::BF16Vec32;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::BF16Vec16;
-};
-#else
-  #ifdef __aarch64__
-    #ifndef ARM_BF16_SUPPORT
-    // pass
-    #else
-template <>
-struct KernelVecType<c10::BFloat16> {
-  using q_load_vec_type = vec_op::BF16Vec8;
-  using q_vec_type = vec_op::FP32Vec16;
-  using k_load_vec_type = vec_op::BF16Vec16;
-  using k_vec_type = vec_op::FP32Vec16;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::BF16Vec16;
-};
-    #endif
-  #else
-template <>
-struct KernelVecType<c10::BFloat16> {
-  using q_load_vec_type = vec_op::BF16Vec8;
-  using q_vec_type = vec_op::FP32Vec16;
-  using k_load_vec_type = vec_op::BF16Vec16;
-  using k_vec_type = vec_op::FP32Vec16;
-  using qk_acc_vec_type = vec_op::FP32Vec16;
-  using v_load_vec_type = vec_op::BF16Vec16;
-};
-  #endif
-#endif
-
-template <typename T>
-FORCE_INLINE std::pair<T, T> reduceSoftmax(T* data, const int size,
-                                           const int capacity) {
-  T max = data[0];
-  for (int i = 1; i < size; ++i) {
-    max = max >= data[i] ? max : data[i];
-  }
-
-  T sum = 0;
-  for (int i = 0; i < size; ++i) {
-    data[i] = std::exp(data[i] - max);
-    sum += data[i];
-  }
-
-  int i = 0;
-  for (; i < size; ++i) {
-    data[i] /= sum;
-  }
-
-  for (; i < capacity; ++i) {
-    data[i] = 0;
-  }
-
-  return {max, sum};
-}
-
-template <typename T>
-FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
-                                                const int capacity,
-                                                const float alibi_slope,
-                                                const int start_index,
-                                                const int seq_len) {
-  data[0] += alibi_slope * (start_index - seq_len + 1);
-  T max = data[0];
-  for (int i = 1; i < size; ++i) {
-    T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
-    data[i] = qk;
-    max = max >= qk ? max : qk;
-  }
-
-  T sum = 0;
-  for (int i = 0; i < size; ++i) {
-    data[i] = std::exp(data[i] - max);
-    sum += data[i];
-  }
-
-  int i = 0;
-  for (; i < size; ++i) {
-    data[i] /= sum;
-  }
-
-  for (; i < capacity; ++i) {
-    data[i] = 0;
-  }
-
-  return {max, sum};
-}
-
-template <typename T>
-FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
-                                         const int size) {
-  T max = max_data[0];
-  for (int i = 1; i < size; ++i) {
-    max = max >= max_data[i] ? max : max_data[i];
-  }
-
-  T rescaled_sum = 0;
-  for (int i = 0; i < size; ++i) {
-    T rescale_factor = std::exp(max_data[i] - max);
-    rescaled_sum += rescale_factor * sum_data[i];
-    sum_data[i] *= rescale_factor;
-  }
-  for (int i = 0; i < size; ++i) {
-    sum_data[i] /= rescaled_sum + 1e-8;
-  }
-}
-
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int x>
-struct reduceQKBlockKernel {
-  using q_load_vec_type = typename KernelVecType<scalar_t>::q_load_vec_type;
-  using q_vec_type = typename KernelVecType<scalar_t>::q_vec_type;
-  using k_load_vec_type = typename KernelVecType<scalar_t>::k_load_vec_type;
-  using k_vec_type = typename KernelVecType<scalar_t>::k_vec_type;
-  using qk_acc_vec_type = typename KernelVecType<scalar_t>::qk_acc_vec_type;
-
-  constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x;
-  constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP;
-  constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4;
-
-  static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4);
-  static_assert(k_load_vec_type::get_elem_num() % x == 0);
-  static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);
-
-  FORCE_INLINE static void call(const scalar_t* __restrict__ q,
-                                const scalar_t* __restrict__ k_block,
-                                float* __restrict__ logits, float scale,
-                                const int token_num) {
-    const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;
-
-    qk_acc_vec_type group_accums[MAX_GROUP_NUM];
-    if (token_num == BLOCK_SIZE) {
-      for (int q_offset = 0; q_offset < HEAD_SIZE;
-           q_offset += x, k_block += x * BLOCK_SIZE) {
-        q_load_vec_type q_load_group_vec(q + q_offset);
-        q_vec_type q_group_vec(q_load_group_vec);
-
-        vec_op::unroll_loop<int, MAX_GROUP_NUM>(
-            [k_block, &q_group_vec, &group_accums](int token_group_idx) {
-              k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
-                                                             TOKEN_PER_GROUP);
-              k_vec_type k_group_vec(k_load_group_vec);
-              vec_op::fma(group_accums[token_group_idx], q_group_vec,
-                          k_group_vec);
-              vec_op::prefetch(k_block + x * BLOCK_SIZE +
-                               token_group_idx * x * TOKEN_PER_GROUP);
-            });
-      }
-    } else {
-      for (int q_offset = 0; q_offset < HEAD_SIZE;
-           q_offset += x, k_block += x * BLOCK_SIZE) {
-        q_load_vec_type q_load_group_vec(q + q_offset);
-        q_vec_type q_group_vec(q_load_group_vec);
-        for (int token_group_start = 0; token_group_start < group_num;
-             token_group_start += UNROLL_GROUP_NUM) {
-          vec_op::unroll_loop<int, UNROLL_GROUP_NUM>(
-              [token_group_start, k_block, &q_group_vec,
-               &group_accums](int token_group_idx) {
-                token_group_idx += token_group_start;
-                k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
-                                                               TOKEN_PER_GROUP);
-                k_vec_type k_group_vec(k_load_group_vec);
-                vec_op::fma(group_accums[token_group_idx], q_group_vec,
-                            k_group_vec);
-                vec_op::prefetch(k_block + x * BLOCK_SIZE +
-                                 token_group_idx * x * TOKEN_PER_GROUP);
-              });
-        }
-      }
-    }
-
-    for (int token_group_idx = 0; token_group_idx < group_num;
-         ++token_group_idx) {
-      vec_op::unroll_loop<int, TOKEN_PER_GROUP>(
-          [&group_accums, logits, scale, token_group_idx](int token_idx) {
-            float dot_v =
-                group_accums[token_group_idx]
-                    .template reduce_sub_sum<qk_acc_vec_type::get_elem_num() /
-                                             TOKEN_PER_GROUP>(token_idx);
-            logits[token_group_idx * TOKEN_PER_GROUP + token_idx] =
-                dot_v * scale;
-          });
-    }
-  }
-};
-
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE,
-          int HEAD_PARTITION_SIZE, typename acc_t>
-FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block,
-                                   acc_t&& acc) {
-  using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
-  constexpr int ELEM_NUM = v_load_vec_type::get_elem_num();
-  static_assert(BLOCK_SIZE == ELEM_NUM);
-  vec_op::FP32Vec16 prob_vec(prob);
-
-  vec_op::unroll_loop<int, HEAD_PARTITION_SIZE>([&](int head_elem_idx) {
-    v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx);
-    vec_op::FP32Vec16 fp32_v_vec(v_vec);
-    acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec;
-  });
-}
-};  // namespace
-
-// Paged attention v1
-namespace {
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
-struct paged_attention_v1_impl {
-  static void call(
-      scalar_t* __restrict__ out,            // [num_seqs, num_heads, head_size]
-      const scalar_t* __restrict__ q,        // [num_seqs, num_heads, head_size]
-      const scalar_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
-                                             // head_size/x, block_size, x]
-      const scalar_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
-                                             // head_size, block_size]
-      const int num_kv_heads, const float scale,
-      const int* __restrict__ block_tables,  // [num_seqs,
-                                             // max_num_blocks_per_seq]
-      const int* __restrict__ seq_lens,      // [num_seqs]
-      const int max_num_blocks_per_seq,
-      const float* __restrict__ alibi_slopes,  // [num_heads]
-      const int q_stride, const int kv_block_stride, const int kv_head_stride,
-      const int num_seqs, const int num_heads) {
-    constexpr int x = 16 / sizeof(scalar_t);
-    const int num_queries_per_kv = num_heads / num_kv_heads;
-
-    static_assert(BLOCK_SIZE == 16);
-
-    int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
-    int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
-    TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
-
-    const int parallel_work_item_num = omp_get_max_threads();
-
-    size_t logits_bytes =
-        parallel_work_item_num * max_seq_len_padded * sizeof(float);
-    float* logits = (float*)std::aligned_alloc(
-        64, logits_bytes);  // Cacheline alignment for each context token.
-                            // [parallel_work_item_num, max_seq_len_padded]
-
-#pragma omp parallel for collapse(2) schedule(dynamic, 1)
-    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
-      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-        int seq_len = seq_lens[seq_idx];
-        const int* seq_block_table =
-            block_tables + max_num_blocks_per_seq * seq_idx;
-        const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        const int64_t kv_head_idx = head_idx / num_queries_per_kv;
-        const scalar_t* __restrict__ q_vec_ptr =
-            q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-        const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE;
-        float* __restrict__ thread_block_logits =
-            logits + omp_get_thread_num() * max_seq_len_padded;
-
-        // Compute logits
-        for (int block_idx = 0; block_idx < block_num; ++block_idx) {
-          const int64_t physical_block_idx = seq_block_table[block_idx];
-          const scalar_t* __restrict__ k_block_cache_ptr =
-              k_cache + physical_block_idx * kv_block_stride +
-              kv_head_idx * kv_head_stride;
-          float* __restrict__ head_block_logits =
-              thread_block_logits + block_idx * BLOCK_SIZE;
-
-          reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
-              q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
-              block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
-        }
-
-        // Compute softmax
-        if (alibi_slopes) {
-          reduceSoftmaxAlibi(thread_block_logits, seq_len,
-                             block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
-                             seq_len);
-        } else {
-          reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE);
-        }
-
-        // Compute value
-        constexpr int head_elem_num_per_partition = 16;
-        constexpr int head_partition_num =
-            HEAD_SIZE / head_elem_num_per_partition;
-        for (int head_part_idx = 0; head_part_idx < head_partition_num;
-             ++head_part_idx) {
-          vec_op::FP32Vec16 accums[head_elem_num_per_partition];
-          scalar_t* __restrict__ out_ptr =
-              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
-              head_part_idx * head_elem_num_per_partition;
-          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
-            const int64_t physical_block_idx = seq_block_table[block_idx];
-            const float* __restrict__ prob_vec_ptr =
-                thread_block_logits + block_idx * BLOCK_SIZE;
-            const scalar_t* __restrict__ v_block_cache_ptr =
-                v_cache + physical_block_idx * kv_block_stride +
-                kv_head_idx * kv_head_stride +
-                BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
-            reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
-                             head_elem_num_per_partition>(
-                prob_vec_ptr, v_block_cache_ptr, accums);
-
-            if (block_idx != block_num - 1) {
-              const int64_t next_physical_block_idx =
-                  seq_block_table[block_idx + 1];
-              const scalar_t* __restrict__ next_v_block_cache_ptr =
-                  v_cache + next_physical_block_idx * kv_block_stride +
-                  kv_head_idx * kv_head_stride +
-                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
-              vec_op::unroll_loop<int, head_elem_num_per_partition>(
-                  [&](int head_elem_idx) {
-                    if (head_elem_idx % 2 == 0) {
-                      vec_op::prefetch(next_v_block_cache_ptr +
-                                       BLOCK_SIZE * head_elem_idx);
-                    }
-                  });
-            }
-          }
-
-          vec_op::unroll_loop<int, head_elem_num_per_partition>(
-              [&](int head_elem_idx) {
-                float value = accums[head_elem_idx].reduce_sum();
-                vec_op::storeFP32(value, out_ptr + head_elem_idx);
-              });
-        }
-      }
-    }
-    std::free(logits);
-  }
-};
-
-#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                   \
-  paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call(                     \
-      out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
-      block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,                  \
-      alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs,   \
-      num_heads);
-
-template <typename T, int BLOCK_SIZE>
-void paged_attention_v1_impl_launcher(
-    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const std::optional<torch::Tensor>& alibi_slopes) {
-  int num_seqs = query.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
-
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr =
-      alibi_slopes
-          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-          : nullptr;
-
-  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
-  T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
-  switch (head_size) {
-    case 32:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
-      break;
-    case 64:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
-      break;
-    case 80:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
-      break;
-    case 96:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
-      break;
-    case 112:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
-      break;
-    case 128:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
-      break;
-    case 192:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
-      break;
-    case 256:
-      LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
-      break;
-    default:
-      TORCH_CHECK(false, "Unsupported head size: ", head_size);
-      break;
-  }
-}
-
-#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE)                               \
-  paged_attention_v1_impl_launcher<T, BLOCK_SIZE>(                           \
-      out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
-      seq_lens, max_seq_len, alibi_slopes);
-
-#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T)                     \
-  switch (block_size) {                                           \
-    case 16:                                                      \
-      CALL_V1_KERNEL_LAUNCHER(T, 16);                             \
-      break;                                                      \
-    default:                                                      \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-      break;                                                      \
-  }
-}  // namespace
-
-void paged_attention_v1(
-    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
-    torch::Tensor& v_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
-  TORCH_CHECK(blocksparse_vert_stride <= 1,
-              "CPU backend does not support blocksparse attention yet.");
-  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
-                               [&] {
-                                 CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
-                                 CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
-                                 CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl)
-                               });
-}
-
-// Paged attention v2
-namespace {
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
-struct paged_attention_v2_impl {
-  static void call(
-      scalar_t* __restrict__ out,            // [num_seqs, num_heads, head_size]
-      float* __restrict__ exp_sums,          // [num_seqs, num_heads,
-                                             // max_num_partitions]
-      float* __restrict__ max_logits,        // [num_seqs, num_heads,
-                                             // max_num_partitions]
-      scalar_t* __restrict__ tmp_out,        // [num_seqs, num_heads,
-                                             // max_num_partitions, head_size]
-      const scalar_t* __restrict__ q,        // [num_seqs, num_heads, head_size]
-      const scalar_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
-                                             // head_size/x, block_size, x]
-      const scalar_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
-                                             // head_size, block_size]
-      const int num_kv_heads, const float scale,
-      const int* __restrict__ block_tables,  // [num_seqs,
-                                             // max_num_blocks_per_seq]
-      const int* __restrict__ seq_lens,      // [num_seqs]
-      const int max_num_blocks_per_seq,
-      const float* __restrict__ alibi_slopes,  // [num_heads]
-      const int q_stride, const int kv_block_stride, const int kv_head_stride,
-      const int num_seqs, const int num_heads, const int max_num_partitions) {
-    constexpr int x = 16 / sizeof(scalar_t);
-    const int num_queries_per_kv = num_heads / num_kv_heads;
-
-    static_assert(BLOCK_SIZE == 16);
-    static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0);
-    static_assert(PARTITION_SIZE % BLOCK_SIZE == 0);
-
-#pragma omp parallel for collapse(3) schedule(static, 1)
-    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
-      for (int partition_idx = 0; partition_idx < max_num_partitions;
-           ++partition_idx) {
-        for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-          const int seq_len = seq_lens[seq_idx];
-          const int start_token_idx = partition_idx * PARTITION_SIZE;
-
-          if (start_token_idx >= seq_len) continue;
-
-          const int partition_num =
-              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-          const bool no_reduce = (partition_num == 1);
-          const int token_num =
-              (std::min(seq_len, start_token_idx + PARTITION_SIZE) -
-               start_token_idx);
-          const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
-          const int last_block_token_num =
-              token_num - (block_num - 1) * BLOCK_SIZE;
-          const int* seq_block_table = block_tables +
-                                       max_num_blocks_per_seq * seq_idx +
-                                       start_token_idx / BLOCK_SIZE;
-          const int64_t kv_head_idx = head_idx / num_queries_per_kv;
-          const scalar_t* __restrict__ q_vec_ptr =
-              q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-
-          float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};
-
-          // Compute logits
-          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
-            const int64_t physical_block_idx = seq_block_table[block_idx];
-            const scalar_t* __restrict__ k_block_cache_ptr =
-                k_cache + physical_block_idx * kv_block_stride +
-                kv_head_idx * kv_head_stride;
-            float* __restrict__ head_block_logits =
-                logits + block_idx * BLOCK_SIZE;
-
-            reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
-                q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
-                block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
-          }
-
-          std::pair<float, float> max_and_sum;
-          if (alibi_slopes) {
-            max_and_sum = reduceSoftmaxAlibi(
-                logits, token_num, block_num * BLOCK_SIZE,
-                alibi_slopes[head_idx], start_token_idx, seq_len);
-          } else {
-            max_and_sum =
-                reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE);
-          }
-
-          auto&& [max_logit, exp_sum] = max_and_sum;
-
-          scalar_t* __restrict__ output_buffer = nullptr;
-          if (!no_reduce) {
-            auto idx = seq_idx * num_heads * max_num_partitions +
-                       head_idx * max_num_partitions + partition_idx;
-            max_logits[idx] = max_logit;
-            exp_sums[idx] = exp_sum;
-            output_buffer =
-                tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
-                head_idx * max_num_partitions * HEAD_SIZE +
-                partition_idx * HEAD_SIZE;
-          } else {
-            output_buffer =
-                out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
-          }
-
-          // Compute value
-          constexpr int head_elem_num_per_partition = 16;
-          constexpr int head_partition_num =
-              HEAD_SIZE / head_elem_num_per_partition;
-          for (int head_part_idx = 0; head_part_idx < head_partition_num;
-               ++head_part_idx) {
-            vec_op::FP32Vec16 accums[head_elem_num_per_partition];
-            scalar_t* __restrict__ out_ptr =
-                output_buffer + head_part_idx * head_elem_num_per_partition;
-            for (int block_idx = 0; block_idx < block_num; ++block_idx) {
-              const int64_t physical_block_idx = seq_block_table[block_idx];
-              const float* __restrict__ prob_vec_ptr =
-                  logits + block_idx * BLOCK_SIZE;
-              const scalar_t* __restrict__ v_block_cache_ptr =
-                  v_cache + physical_block_idx * kv_block_stride +
-                  kv_head_idx * kv_head_stride +
-                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
-              reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
-                               head_elem_num_per_partition>(
-                  prob_vec_ptr, v_block_cache_ptr, accums);
-
-              if (block_idx != block_num - 1) {
-                const int64_t next_physical_block_idx =
-                    seq_block_table[block_idx + 1];
-                const scalar_t* __restrict__ next_v_block_cache_ptr =
-                    v_cache + next_physical_block_idx * kv_block_stride +
-                    kv_head_idx * kv_head_stride +
-                    BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
-                vec_op::unroll_loop<int, head_elem_num_per_partition>(
-                    [&](int head_elem_idx) {
-                      if (head_elem_idx % 2 == 0) {
-                        vec_op::prefetch(next_v_block_cache_ptr +
-                                         BLOCK_SIZE * head_elem_idx);
-                      }
-                    });
-              }
-            }
-
-            vec_op::unroll_loop<int, head_elem_num_per_partition>(
-                [&](int head_elem_idx) {
-                  float value = accums[head_elem_idx].reduce_sum();
-                  vec_op::storeFP32(value, out_ptr + head_elem_idx);
-                });
-          }
-        }
-      }
-    }
-
-    // Rescale partition softmax and store the factors to exp_sums
-#pragma omp parallel for collapse(2) schedule(static, 1)
-    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
-      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-        const int seq_len = seq_lens[seq_idx];
-        const int partition_num =
-            (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-
-        if (partition_num == 1) continue;
-
-        reducePartitionSoftmax(
-            max_logits + seq_idx * num_heads * max_num_partitions +
-                head_idx * max_num_partitions,
-            exp_sums + seq_idx * num_heads * max_num_partitions +
-                head_idx * max_num_partitions,
-            partition_num);
-      }
-    }
-
-    // Reduce values
-    using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
-    static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
-    constexpr int head_elem_num_per_group =
-        16;  // Note: didn't align with the cacheline size, due to some
-             // HEAD_SIZE didn't align with 64 bytes
-    static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
-    constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
-    const float* __restrict__ rescale_factors = exp_sums;
-#pragma omp parallel for collapse(3) schedule(static, 1)
-    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
-      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-        for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
-          const int seq_len = seq_lens[seq_idx];
-          const int partition_num =
-              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-
-          if (partition_num == 1) continue;
-
-          const float* __restrict__ seq_head_rescale_factors =
-              rescale_factors + seq_idx * num_heads * max_num_partitions +
-              head_idx * max_num_partitions;
-          const scalar_t* __restrict__ seq_head_tmp_out =
-              tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
-              head_idx * max_num_partitions * HEAD_SIZE +
-              group_idx * head_elem_num_per_group;
-          scalar_t* __restrict__ seq_head_output =
-              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
-              group_idx * head_elem_num_per_group;
-
-          vec_op::FP32Vec16 acc;
-          for (int i = 0; i < partition_num; ++i) {
-            vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]);
-            v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE);
-            vec_op::FP32Vec16 fp32_value(value);
-            acc = acc + fp32_value * rescale_factor;
-          }
-          v_load_vec_type cast_acc(acc);
-          cast_acc.save(seq_head_output);
-        }
-      }
-    }
-  }
-};
-
-#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                 \
-  paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call(   \
-      out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr,         \
-      key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
-      seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,      \
-      kv_block_stride, kv_head_stride, num_seqs, num_heads,                  \
-      max_num_partitions);
-
-template <typename T, int BLOCK_SIZE, int PARTITION_SIZE = 512>
-void paged_attention_v2_impl_launcher(
-    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
-    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
-  int num_seqs = query.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
-  int max_num_partitions = exp_sums.size(-1);
-
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr =
-      alibi_slopes
-          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-          : nullptr;
-
-  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
-  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
-  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
-  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
-  T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
-  switch (head_size) {
-    case 32:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
-      break;
-    case 64:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
-      break;
-    case 80:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
-      break;
-    case 96:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
-      break;
-    case 112:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
-      break;
-    case 128:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
-      break;
-    case 192:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
-      break;
-    case 256:
-      LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
-      break;
-    default:
-      TORCH_CHECK(false, "Unsupported head size: ", head_size);
-      break;
-  }
-}
-
-#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE)                              \
-  paged_attention_v2_impl_launcher<T, BLOCK_SIZE>(                          \
-      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,    \
-      num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \
-      alibi_slopes);
-
-#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T)                     \
-  switch (block_size) {                                           \
-    case 16:                                                      \
-      CALL_V2_KERNEL_LAUNCHER(T, 16);                             \
-      break;                                                      \
-    default:                                                      \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-      break;                                                      \
-  }
-}  // namespace
-
-void paged_attention_v2(
-    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
-    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
-    torch::Tensor& v_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
-  TORCH_CHECK(blocksparse_vert_stride <= 1,
-              "CPU backend does not support blocksparse attention yet.");
-  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
-                               [&] {
-                                 CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
-                                 CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
-                                 CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
-                               });
-}
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -1,214 +0,0 @@
-#include <map>
-#include <vector>
-
-#include "cpu_types.hpp"
-
-#if defined(__x86_64__)
-  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
-#else
-  #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
-#endif
-
-namespace {
-template <typename scalar_t>
-void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
-                          std::vector<torch::Tensor> const& value_caches,
-                          const torch::Tensor& mapping_pairs,
-                          const int element_num_per_block,
-                          const int layer_num) {
-  const size_t pair_num = mapping_pairs.size(0);
-  const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
-#pragma omp parallel for collapse(2)
-  for (int layer = 0; layer < layer_num; ++layer) {
-    for (size_t pair = 0; pair < pair_num; ++pair) {
-      int64_t source_offset =
-          element_num_per_block * mapping_pairs[pair][0].item<int64_t>();
-      int64_t target_offset =
-          element_num_per_block * mapping_pairs[pair][1].item<int64_t>();
-      scalar_t* key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
-      scalar_t* source_ptr = key_cache_ptr + source_offset;
-      scalar_t* target_ptr = key_cache_ptr + target_offset;
-      std::memcpy(target_ptr, source_ptr, block_bytes);
-
-      scalar_t* value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
-      source_ptr = value_cache_ptr + source_offset;
-      target_ptr = value_cache_ptr + target_offset;
-      std::memcpy(target_ptr, source_ptr, block_bytes);
-    }
-  }
-}
-
-template <typename scalar_t>
-void reshape_and_cache_cpu_impl(
-    const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
-    scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
-    const int64_t* __restrict__ slot_mapping, const int num_tokens,
-    const int key_stride, const int value_stride, const int num_heads,
-    const int head_size, const int block_size, const int x) {
-  const int block_elem_num = num_heads * head_size * block_size;
-
-#pragma omp parallel for collapse(2)
-  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
-    for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
-      const int64_t slot_idx = slot_mapping[token_idx];
-      if (slot_idx >= 0) {
-        int src_key_head_idx = token_idx * key_stride + head_idx * head_size;
-        int src_value_head_idx =
-            token_idx * value_stride + head_idx * head_size;
-        const scalar_t* src_key_head_ptr = key + src_key_head_idx;
-        const scalar_t* src_value_head_ptr = value + src_value_head_idx;
-        const int64_t block_index = slot_idx / block_size;
-        const int64_t block_offset = slot_idx % block_size;
-        scalar_t* target_key_head_ptr = key_cache +
-                                        block_elem_num * block_index +
-                                        head_idx * block_size * head_size;
-        scalar_t* target_value_head_ptr = value_cache +
-                                          block_elem_num * block_index +
-                                          head_idx * block_size * head_size;
-
-        for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) {
-          const int64_t target_offset =
-              src_key_idx * block_size + block_offset * x;
-          for (int i = 0; i < x; ++i) {
-            target_key_head_ptr[target_offset + i] =
-                src_key_head_ptr[src_key_idx + i];
-          }
-        }
-
-        for (int src_value_idx = 0; src_value_idx < head_size;
-             ++src_value_idx) {
-          const int64_t target_offset =
-              src_value_idx * block_size + block_offset;
-          target_value_head_ptr[target_offset] =
-              src_value_head_ptr[src_value_idx];
-        }
-      }
-    }
-  }
-}
-};  // namespace
-
-template <typename scalar_t>
-void concat_and_cache_mla_cpu_impl(
-    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
-    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
-    scalar_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
-                                      // + pe_dim)]
-    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
-    const int num_tokens,                      //
-    const int block_stride,                    //
-    const int entry_stride,                    //
-    const int kv_c_stride,                     //
-    const int k_pe_stride,                     //
-    const int kv_lora_rank,                    //
-    const int pe_dim,                          //
-    const int block_size                       //
-) {
-#pragma omp parallel for
-  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
-    const int64_t slot_idx = slot_mapping[token_idx];
-    // NOTE: slot_idx can be -1 if the token is padded
-    if (slot_idx < 0) {
-      continue;
-    }
-    const int64_t block_idx = slot_idx / block_size;
-    const int64_t block_offset = slot_idx % block_size;
-
-    auto copy = [&](const scalar_t* __restrict__ src,
-                    scalar_t* __restrict__ dst, int src_stride, int dst_stride,
-                    int size, int offset) {
-      for (int i = 0; i < size; i++) {
-        const int64_t src_idx = token_idx * src_stride + i;
-        const int64_t dst_idx =
-            block_idx * block_stride + block_offset * entry_stride + i + offset;
-        dst[dst_idx] = src[src_idx];
-      }
-    };
-
-    copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
-    copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
-  }
-}
-
-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping) {
-  unsigned num_layers = key_caches.size();
-  TORCH_CHECK(num_layers == value_caches.size());
-  if (num_layers == 0) {
-    return;
-  }
-
-  const int element_num_per_block = key_caches[0][0].numel();
-  DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
-    CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
-    copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
-                                   element_num_per_block, num_layers);
-    CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
-  });
-}
-
-void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
-                       torch::Tensor& key_cache, torch::Tensor& value_cache,
-                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype,
-                       torch::Tensor& k_scale, torch::Tensor& v_scale) {
-  int num_tokens = key.size(0);
-  int num_heads = key.size(1);
-  int head_size = key.size(2);
-  int block_size = key_cache.size(3);
-  int x = key_cache.size(4);
-
-  int key_stride = key.stride(0);
-  int value_stride = value.stride(0);
-
-  DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
-    CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
-    reshape_and_cache_cpu_impl<scalar_t>(
-        key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
-        key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
-        slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride,
-        num_heads, head_size, block_size, x);
-    CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
-  });
-}
-
-void concat_and_cache_mla(
-    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
-    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
-    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
-                                  // pe_dim)]
-    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
-    const std::string& kv_cache_dtype, torch::Tensor& scale) {
-  int num_tokens = slot_mapping.size(0);
-  int kv_lora_rank = kv_c.size(1);
-  int pe_dim = k_pe.size(1);
-  int block_size = kv_cache.size(1);
-
-  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
-  TORCH_CHECK(kv_cache_dtype != "fp8");
-
-  int kv_c_stride = kv_c.stride(0);
-  int k_pe_stride = k_pe.stride(0);
-  int block_stride = kv_cache.stride(0);
-  int entry_stride = kv_cache.stride(1);
-
-  VLLM_DISPATCH_FLOATING_TYPES(
-      kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
-        CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
-        concat_and_cache_mla_cpu_impl<scalar_t>(
-            kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
-            kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
-            num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
-            kv_lora_rank, pe_dim, block_size);
-        CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
-      });
-}
-
-void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
-                 const torch::Tensor& block_mapping) {
-  TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
-}
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -0,0 +1,266 @@
+#include "cpu_attn_vec.hpp"
+#include "cpu_attn_vec16.hpp"
+
+#ifdef CPU_CAPABILITY_AMXBF16
+  #include "cpu_attn_amx.hpp"
+  #define AMX_DISPATCH(...)                                                   \
+    case cpu_attention::ISA::AMX: {                                           \
+      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX, \
+                                                     scalar_t, head_dim>;     \
+      return __VA_ARGS__();                                                   \
+    }
+#else
+  #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
+#endif
+
+#ifdef __aarch64__
+  #include "cpu_attn_neon.hpp"
+  #define NEON_DISPATCH(...)                                                   \
+    case cpu_attention::ISA::NEON: {                                           \
+      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
+                                                     scalar_t, head_dim>;      \
+      return __VA_ARGS__();                                                    \
+    }
+#else
+  #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
+#endif  // #ifdef __aarch64__
+
+#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
+  case HEAD_DIM: {                            \
+    constexpr size_t head_dim = HEAD_DIM;     \
+    return __VA_ARGS__();                     \
+  }
+
+#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...)           \
+  [&] {                                                         \
+    switch (HEAD_DIM) {                                         \
+      CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                  \
+      CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                  \
+      CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                  \
+      CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__)                  \
+      CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__)                  \
+      default: {                                                \
+        TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \
+                               std::to_string(HEAD_DIM));       \
+      }                                                         \
+    }                                                           \
+  }()
+
+#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...)                                 \
+  [&] {                                                                       \
+    switch (ISA_TYPE) {                                                       \
+      AMX_DISPATCH(__VA_ARGS__)                                               \
+      NEON_DISPATCH(__VA_ARGS__)                                              \
+      case cpu_attention::ISA::VEC: {                                         \
+        using attn_impl =                                                     \
+            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t,   \
+                                         head_dim>;                           \
+        return __VA_ARGS__();                                                 \
+      }                                                                       \
+      case cpu_attention::ISA::VEC16: {                                       \
+        using attn_impl =                                                     \
+            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t, \
+                                         head_dim>;                           \
+        return __VA_ARGS__();                                                 \
+      }                                                                       \
+      default: {                                                              \
+        TORCH_CHECK(false, "Invalid CPU attention ISA type.");                \
+      }                                                                       \
+    }                                                                         \
+  }()
+
+torch::Tensor get_scheduler_metadata(
+    const int64_t num_req, const int64_t num_heads_q,
+    const int64_t num_heads_kv, const int64_t head_dim,
+    const torch::Tensor& seq_lens, at::ScalarType dtype,
+    const torch::Tensor& query_start_loc, const bool casual,
+    const int64_t window_size, const std::string& isa_hint,
+    const bool enable_kv_split) {
+  cpu_attention::ISA isa;
+  if (isa_hint == "amx") {
+    isa = cpu_attention::ISA::AMX;
+  } else if (isa_hint == "vec") {
+    isa = cpu_attention::ISA::VEC;
+  } else if (isa_hint == "vec16") {
+    isa = cpu_attention::ISA::VEC16;
+  } else if (isa_hint == "neon") {
+    isa = cpu_attention::ISA::NEON;
+  } else {
+    TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
+  }
+
+  cpu_attention::AttentionScheduler::ScheduleInput input;
+  input.num_reqs = num_req;
+  input.num_heads_q = num_heads_q;
+  input.num_heads_kv = num_heads_kv;
+  input.head_dim = head_dim;
+  input.query_start_loc = query_start_loc.data_ptr<int32_t>();
+  input.seq_lens = seq_lens.data_ptr<int32_t>();
+  if (window_size != -1) {
+    input.left_sliding_window_size = window_size - 1;
+    if (casual) {
+      input.right_sliding_window_size = 0;
+    } else {
+      input.right_sliding_window_size = window_size - 1;
+    }
+  } else {
+    input.left_sliding_window_size = -1;
+    if (casual) {
+      input.right_sliding_window_size = 0;
+    } else {
+      input.right_sliding_window_size = -1;
+    }
+  }
+  input.casual = casual;
+  input.isa = isa;
+  input.enable_kv_split = enable_kv_split;
+  TORCH_CHECK(casual, "Only supports casual mask for now.");
+
+  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
+    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
+      CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
+        input.elem_size = sizeof(scalar_t);
+        input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
+        input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
+        input.output_buffer_elem_size =
+            sizeof(attn_impl::partial_output_buffer_t);
+        input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
+        input.kv_block_alignment = attn_impl::BlockSizeAlignment;
+      });
+    });
+  });
+
+  cpu_attention::AttentionScheduler scheduler;
+  torch::Tensor metadata = scheduler.schedule(input);
+  return metadata;
+}
+
+void cpu_attn_reshape_and_cache(
+    const torch::Tensor& key,    // [token_num, head_num, head_size]
+    const torch::Tensor& value,  // [token_num, head_num, head_size]
+    torch::Tensor&
+        key_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
+    torch::Tensor&
+        value_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
+    const torch::Tensor& slot_mapping, const std::string& isa) {
+  TORCH_CHECK_EQ(key.dim(), 3);
+  TORCH_CHECK_EQ(value.dim(), 3);
+  TORCH_CHECK_EQ(key_cache.dim(), 4);
+  TORCH_CHECK_EQ(value_cache.dim(), 4);
+  TORCH_CHECK_EQ(key.stride(2), 1);
+  TORCH_CHECK_EQ(value.stride(2), 1);
+
+  const int64_t token_num = key.size(0);
+  const int64_t key_token_num_stride = key.stride(0);
+  const int64_t value_token_num_stride = value.stride(0);
+  const int64_t head_num = value.size(1);
+  const int64_t key_head_num_stride = key.stride(1);
+  const int64_t value_head_num_stride = value.stride(1);
+  const int64_t num_blocks = key_cache.size(0);
+  const int64_t num_blocks_stride = key_cache.stride(0);
+  const int64_t cache_head_num_stride = key_cache.stride(1);
+  const int64_t block_size = key_cache.size(2);
+  const int64_t block_size_stride = key_cache.stride(2);
+  const int64_t head_dim = key.size(-1);
+
+  cpu_attention::ISA isa_tag = [&]() {
+    if (isa == "amx") {
+      return cpu_attention::ISA::AMX;
+    } else if (isa == "vec") {
+      return cpu_attention::ISA::VEC;
+    } else if (isa == "vec16") {
+      return cpu_attention::ISA::VEC16;
+    } else if (isa == "neon") {
+      return cpu_attention::ISA::NEON;
+    } else {
+      TORCH_CHECK(false, "Invalid ISA type: " + isa);
+    }
+  }();
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
+        CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
+          CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
+            attn_impl::reshape_and_cache(
+                key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
+                key_cache.data_ptr<scalar_t>(),
+                value_cache.data_ptr<scalar_t>(),
+                slot_mapping.data_ptr<int64_t>(), token_num,
+                key_token_num_stride, value_token_num_stride, head_num,
+                key_head_num_stride, value_head_num_stride, num_blocks,
+                num_blocks_stride, cache_head_num_stride, block_size,
+                block_size_stride);
+          });
+        });
+      });
+}
+
+void cpu_attention_with_kv_cache(
+    const torch::Tensor& query,  // [num_tokens, num_heads, head_size]
+    const torch::Tensor&
+        key_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
+    const torch::Tensor&
+        value_cache,        // [num_blocks, num_kv_heads, block_size, head_size]
+    torch::Tensor& output,  // [num_tokens, num_heads, head_size]
+    const torch::Tensor& query_start_loc,  // [num_tokens + 1]
+    const torch::Tensor& seq_lens,         // [num_tokens]
+    const double scale, const bool causal,
+    const std::optional<torch::Tensor>& alibi_slopes,  // [num_heads]
+    const int64_t sliding_window_left, const int64_t sliding_window_right,
+    const torch::Tensor& block_table,  // [num_tokens, max_block_num]
+    const double softcap, const torch::Tensor& scheduler_metadata,
+    const std::optional<torch::Tensor>& s_aux  // [num_heads]
+) {
+  TORCH_CHECK_EQ(query.dim(), 3);
+  TORCH_CHECK_EQ(query.stride(2), 1);
+  TORCH_CHECK_EQ(key_cache.dim(), 4);
+  TORCH_CHECK_EQ(value_cache.dim(), 4);
+
+  cpu_attention::AttentionInput input;
+  input.metadata = reinterpret_cast<cpu_attention::AttentionMetadata*>(
+      scheduler_metadata.data_ptr());
+  input.num_tokens = query.size(0);
+  input.num_heads = query.size(1);
+  input.num_kv_heads = key_cache.size(1);
+  input.block_size = key_cache.size(2);
+  input.query = query.data_ptr();
+  input.query_num_tokens_stride = query.stride(0);
+  input.query_num_heads_stride = query.stride(1);
+  input.cache_num_blocks_stride = key_cache.stride(0);
+  input.cache_num_kv_heads_stride = key_cache.stride(1);
+  input.blt_num_tokens_stride = block_table.stride(0);
+  input.key_cache = key_cache.data_ptr();
+  input.value_cache = value_cache.data_ptr();
+  input.output = output.data_ptr();
+  input.query_start_loc = query_start_loc.data_ptr<int32_t>();
+  input.seq_lens = seq_lens.data_ptr<int32_t>();
+  input.block_table = block_table.data_ptr<int32_t>();
+  input.alibi_slopes =
+      alibi_slopes.has_value() ? alibi_slopes->data_ptr<float>() : nullptr;
+  // For now sink must be bf16
+  input.s_aux = s_aux.has_value() ? s_aux->data_ptr<c10::BFloat16>() : nullptr;
+  input.scale = scale;
+  input.causal = causal;
+  input.sliding_window_left = sliding_window_left;
+  input.sliding_window_right = sliding_window_right;
+  if (input.causal) {
+    // to make boundary calculation easier
+    input.sliding_window_right = 0;
+  }
+  float softcap_fp32 = softcap;
+  input.softcap = softcap_fp32;
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
+        CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
+          CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
+            TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
+            cpu_attention::AttentionMainLoop<attn_impl> mainloop;
+            mainloop(&input);
+          });
+        });
+      });
+}
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -0,0 +1,511 @@
+#ifndef CPU_ATTN_AMX_HPP
+#define CPU_ATTN_AMX_HPP
+
+#include "cpu_attn_impl.hpp"
+
+namespace cpu_attention {
+namespace {
+// AMX specific
+constexpr static int64_t AMX_TILE_ROW_BYTES = 64;
+constexpr static int64_t AMX_TILE_ROW_NUM = 16;
+constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM;
+
+typedef struct __tile_config {
+  uint8_t palette_id = 1;
+  uint8_t start_row = 0;
+  uint8_t reserved_0[14] = {0};
+  uint16_t colsb[16] = {0};
+  uint8_t rows[16] = {0};
+} __tilecfg;
+
+// 2-2-4 pattern, for 16 < m <= 32
+// TILE 0, 1: load A matrix, row num should be 16, m - 16
+// TILE 2, 3: load B matrix, row num should be 16
+// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m
+// - 16
+template <typename kv_cache_t>
+class TileGemm224 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
+                                void* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224");
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224");
+  }
+};
+
+template <>
+class TileGemm224<c10::BFloat16> {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                c10::BFloat16* __restrict__ a_tile,
+                                c10::BFloat16* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    const int32_t k_times =
+        dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+    c10::BFloat16* __restrict__ a_tile_0 = a_tile;
+    c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM;
+    const int64_t a_tile_stride = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // q_buffer is prepacked
+        return AMX_TILE_ROW_BYTES;
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // logits_buffer is row-major
+        return lda * sizeof(c10::BFloat16);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+
+    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
+    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // k_cache is prepacked
+        return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // v_cache is prepacked
+        return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+    // k_cache, v_cache are prepacked
+    const int32_t b_tile_stride = AMX_TILE_ROW_BYTES;
+
+    // logits_buffer, output_buffer are not prepacked
+    float* __restrict__ c_tile_4 = c_tile;
+    float* __restrict__ c_tile_5 =
+        c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float);
+    float* __restrict__ c_tile_6 = c_tile + AMX_TILE_ROW_NUM * ldc;
+    float* __restrict__ c_tile_7 =
+        c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float);
+    const int32_t c_tile_stride = ldc * sizeof(float);
+
+    if (accum_c) {
+      _tile_loadd(4, c_tile_4, c_tile_stride);
+      _tile_loadd(5, c_tile_5, c_tile_stride);
+      _tile_loadd(6, c_tile_6, c_tile_stride);
+      _tile_loadd(7, c_tile_7, c_tile_stride);
+    } else {
+      _tile_zero(4);
+      _tile_zero(5);
+      _tile_zero(6);
+      _tile_zero(7);
+    }
+
+    for (int32_t k = 0; k < k_times; ++k) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_tile_stride);
+      _tile_dpbf16ps(4, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_tile_stride);
+      _tile_dpbf16ps(5, 0, 3);
+      _tile_loadd(1, a_tile_1, a_tile_stride);
+      _tile_dpbf16ps(6, 1, 2);
+      _tile_dpbf16ps(7, 1, 3);
+
+      // update ptrs
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // Q buffer is prepacked
+        a_tile_0 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+        a_tile_1 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // P buffer is not prepacked
+        a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+        a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+      b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    }
+
+    _tile_stored(4, c_tile_4, c_tile_stride);
+    _tile_stored(5, c_tile_5, c_tile_stride);
+    _tile_stored(6, c_tile_6, c_tile_stride);
+    _tile_stored(7, c_tile_7, c_tile_stride);
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    const int32_t m_0 = AMX_TILE_ROW_NUM;
+    const int32_t m_1 = m - AMX_TILE_ROW_NUM;
+    config.rows[0] = m_0;
+    config.rows[1] = m_1;
+    config.rows[2] = AMX_TILE_ROW_NUM;
+    config.rows[3] = AMX_TILE_ROW_NUM;
+    config.rows[4] = m_0;
+    config.rows[5] = m_0;
+    config.rows[6] = m_1;
+    config.rows[7] = m_1;
+    _tile_loadconfig(&config);
+  }
+};
+
+// 1-2-2 pattern, for 0 < m <= 16
+// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be
+// m, m
+// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row
+// num should be 16
+// TILE 6, 7, (6, 7): store results C matrix, row num should be
+// m
+template <typename kv_cache_t>
+class TileGemm122 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
+                                void* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122");
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122");
+  }
+};
+
+template <>
+class TileGemm122<c10::BFloat16> {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                c10::BFloat16* __restrict__ a_tile,
+                                c10::BFloat16* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    c10::BFloat16* __restrict__ a_tile_0 = a_tile;
+    c10::BFloat16* __restrict__ a_tile_1 = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // q_buffer is prepacked
+        return a_tile + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // logits_buffer is row-major
+        return a_tile + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+    const int64_t a_tile_stride = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // q_buffer is prepacked
+        return AMX_TILE_ROW_BYTES;
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // logits_buffer is row-major
+        return lda * sizeof(c10::BFloat16);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+
+    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
+    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // k_cache is prepacked
+        return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // v_cache is prepacked
+        return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
+      } else {
+        TORCH_CHECK(false, "Unreachable");
+      }
+    }();
+    c10::BFloat16* __restrict__ b_tile_4 =
+        b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    c10::BFloat16* __restrict__ b_tile_5 =
+        b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    int64_t b_stride = AMX_TILE_ROW_BYTES;
+
+    float* __restrict__ c_tile_6 = c_tile;
+    float* __restrict__ c_tile_7 = c_tile + AMX_TILE_ROW_BYTES / sizeof(float);
+    int64_t c_stride = ldc * sizeof(float);
+
+    const int32_t k_times =
+        dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+    const int32_t k_group_times = k_times / 2;
+    const bool has_tail = (k_times % 2 == 1);
+
+    if (accum_c) {
+      _tile_loadd(6, c_tile_6, c_stride);
+      _tile_loadd(7, c_tile_7, c_stride);
+    } else {
+      _tile_zero(6);
+      _tile_zero(7);
+    }
+
+    for (int32_t k = 0; k < k_group_times; ++k) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_dpbf16ps(6, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_dpbf16ps(7, 0, 3);
+      _tile_loadd(1, a_tile_1, a_tile_stride);
+      _tile_stream_loadd(4, b_tile_4, b_stride);
+      _tile_dpbf16ps(6, 1, 4);
+      _tile_stream_loadd(5, b_tile_5, b_stride);
+      _tile_dpbf16ps(7, 1, 5);
+
+      // update ptrs
+      if constexpr (phase == AttentionGemmPhase::QK) {
+        // Q buffer is prepacked
+        a_tile_0 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+        a_tile_1 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      } else if constexpr (phase == AttentionGemmPhase::PV) {
+        // P buffer is not prepacked
+        a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+        a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      }
+      b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_3 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    }
+
+    if (has_tail) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_dpbf16ps(6, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_dpbf16ps(7, 0, 3);
+    }
+
+    _tile_stored(6, c_tile_6, c_stride);
+    _tile_stored(7, c_tile_7, c_stride);
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    config.rows[0] = m;
+    config.rows[1] = m;
+    config.rows[2] = AMX_TILE_ROW_NUM;
+    config.rows[3] = AMX_TILE_ROW_NUM;
+    config.rows[4] = AMX_TILE_ROW_NUM;
+    config.rows[5] = AMX_TILE_ROW_NUM;
+    config.rows[6] = m;
+    config.rows[7] = m;
+    _tile_loadconfig(&config);
+  }
+};
+}  // namespace
+
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = scalar_t;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = scalar_t;
+
+  constexpr static int64_t BlockSizeAlignment =
+      AMX_TILE_ROW_BYTES /
+      sizeof(kv_cache_t);  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      2 * (AMX_TILE_ROW_BYTES / 4);  // headdim num unit of PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = 32;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::AMX;
+  constexpr static bool scale_on_logits = true;
+
+ public:
+  AttentionImpl() : current_q_head_num_(0) {
+    // Use all columns in AMX tiles
+    vec_op::unroll_loop<int, 8>([&](int i) { amx_tile_config_.colsb[i] = 64; });
+  }
+
+  ~AttentionImpl() { _tile_release(); }
+
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    if (q_head_num > AMX_TILE_ROW_NUM) {
+      if (q_head_num != current_q_head_num_) {
+        current_q_head_num_ = q_head_num;
+        TileGemm224<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
+      }
+      attention<TileGemm224<kv_cache_t>> attention_iteration;
+      attention_iteration(CPU_ATTENTION_PARAMS);
+    } else {
+      if (q_head_num != current_q_head_num_) {
+        current_q_head_num_ = q_head_num;
+        TileGemm122<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
+      }
+      attention<TileGemm122<kv_cache_t>> attention_iteration;
+      attention_iteration(CPU_ATTENTION_PARAMS);
+    }
+  }
+
+  // k_cache_token_group_stride: stride of K cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment * head_dim;
+  }
+
+  // v_cache_token_group_stride: stride of V cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment * (AMX_TILE_ROW_BYTES / 4);
+  }
+
+  // v_cache_head_group_stride: stride of V cache when move to next
+  // HeadDimAlignment head dims in a block
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return block_size * HeadDimAlignment;
+  }
+
+  static void copy_q_heads_tile(
+      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
+      scalar_t* __restrict__ q_buffer, const int32_t q_num,
+      const int32_t q_heads_per_kv, const int64_t q_num_stride,
+      const int64_t q_head_stride, const float scale) {
+    constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
+    static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
+    constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
+    constexpr int64_t head_elem_num_pre_block =
+        AMX_TILE_ROW_BYTES / sizeof(scalar_t);
+
+    int32_t idx = 0;
+    int8_t* __restrict__ q_buffer_iter = reinterpret_cast<int8_t*>(q_buffer);
+    for (int32_t q_num_idx = 0; q_num_idx < q_num;
+         ++q_num_idx, src += q_num_stride) {
+      scalar_t* __restrict__ src_iter = src;
+      for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv;
+           ++q_head_idx, src_iter += q_head_stride) {
+        vec_op::unroll_loop<int32_t, head_size_block_num>(
+            [&](int32_t head_size_block_idx) {
+              // Use INT8Vec64 for 64 bytes block
+              vec_op::INT8Vec64 vec(src_iter + head_size_block_idx *
+                                                   head_elem_num_pre_block);
+              vec.save(q_buffer_iter + head_size_block_idx * AMX_TILE_BYTES);
+            });
+
+        ++idx;
+        q_buffer_iter += AMX_TILE_ROW_BYTES;
+        if ((idx & (AMX_TILE_ROW_NUM - 1)) == 0) {
+          // head is in another amx tile
+          q_buffer_iter -= AMX_TILE_ROW_NUM * AMX_TILE_ROW_BYTES;
+          q_buffer_iter += head_size_block_num * AMX_TILE_BYTES;
+        }
+      }
+    }
+  }
+
+  // reshape KV to AMX friendly layout
+  static void reshape_and_cache(
+      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride, const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size, const int64_t block_size_stride) {
+    // For AMX 2D tiles, size of each line is 64 bytes
+    constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
+    // For AMX B martix, N always is 16
+    constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
+    constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
+    // For now suppose block_size is divisible by amx_tile_column_num
+    TORCH_CHECK_EQ(block_size % amx_b_tile_k_size, 0);
+
+#pragma omp parallel for collapse(2)
+    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) {
+          // skip
+          continue;
+        }
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+        {
+          // Write Key
+          // Head elements should be packed as quand-words and stored in token
+          // groups with (quadword_stride/4) tokens
+          constexpr int64_t token_num_per_group = amx_tile_row_size / 4;
+          static_assert(head_dim % (4 / sizeof(scalar_t)) == 0);
+          constexpr int64_t quadword_num = head_dim / (4 / sizeof(scalar_t));
+          const int32_t* key_start_quadword_ptr =
+              reinterpret_cast<const int32_t*>(
+                  key + token_idx * key_token_num_stride +
+                  head_idx * key_head_num_stride);
+          const int64_t group_idx = block_offset / token_num_per_group;
+          const int64_t group_offset = block_offset % token_num_per_group;
+          constexpr int64_t quadword_num_per_group =
+              token_num_per_group * quadword_num;
+          int32_t* key_cache_start_ptr =
+              reinterpret_cast<int32_t*>(key_cache +
+                                         block_idx * num_blocks_stride +
+                                         head_idx * cache_head_num_stride) +
+              group_idx * quadword_num_per_group + group_offset;
+
+#pragma GCC unroll 8
+          for (int64_t i = 0, j = 0; j < quadword_num;
+               i += token_num_per_group, ++j) {
+            key_cache_start_ptr[i] = key_start_quadword_ptr[j];
+          }
+        }
+        {
+          // Write Value
+          // Different from Key, block_size dimension is packed rather than
+          // head_size dimension block_size dimension is packed as quand-words;
+          constexpr int64_t token_num_per_sub_group = 4 / sizeof(scalar_t);
+          const int64_t token_num_per_group = block_size;
+          constexpr int64_t head_elems_per_group = amx_b_tile_n_size;
+          const int64_t group_size = token_num_per_group * head_elems_per_group;
+          // For now suppose head_dim is divisible by amx_b_tile_n_size
+          static_assert(head_dim % head_elems_per_group == 0);
+          constexpr int64_t group_num = head_dim / head_elems_per_group;
+          const int64_t sub_group_idx = block_offset / token_num_per_sub_group;
+          const int64_t sub_group_offset =
+              block_offset % token_num_per_sub_group;
+
+          const scalar_t* value_start_ptr = value +
+                                            token_idx * value_token_num_stride +
+                                            head_idx * value_head_num_stride;
+          scalar_t* value_cache_start_ptr =
+              value_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride +
+              sub_group_idx * token_num_per_sub_group * amx_b_tile_n_size +
+              sub_group_offset;
+
+          for (int64_t i = 0; i < group_num; ++i) {
+#pragma GCC unroll head_elems_per_group
+            for (int64_t j = 0, k = 0; j < head_elems_per_group;
+                 ++j, k += token_num_per_sub_group) {
+              value_cache_start_ptr[k] = value_start_ptr[j];
+            }
+            value_start_ptr += head_elems_per_group;
+            value_cache_start_ptr += group_size;
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  alignas(64) __tilecfg amx_tile_config_;
+  int32_t current_q_head_num_;
+};
+}  // namespace cpu_attention
+
+#endif
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
--- a/csrc/cpu/cpu_attn_macros.h
+++ b/csrc/cpu/cpu_attn_macros.h
@@ -0,0 +1,63 @@
+#ifndef CPU_ATTN_MACROS_H
+#define CPU_ATTN_MACROS_H
+
+// x86_64
+#ifdef __x86_64__
+  #define FAST_SPINNING _mm_pause();
+
+  #ifdef __AVX512F__
+    #define DEFINE_FAST_EXP                                                    \
+      const __m512 vec_factorial_1 = _mm512_set1_ps(0.999999701f);             \
+      const __m512 vec_factorial_2 = _mm512_set1_ps(0.499991506f);             \
+      const __m512 vec_factorial_3 = _mm512_set1_ps(0.166676521f);             \
+      const __m512 vec_factorial_4 = _mm512_set1_ps(0.0418978221f);            \
+      const __m512 vec_factorial_5 = _mm512_set1_ps(0.00828929059f);           \
+      const __m512 vec_exp_log2ef =                                            \
+          _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b));                  \
+      const __m512 vec_half = _mm512_set1_ps(0.5f);                            \
+      const __m512 vec_one = _mm512_set1_ps(1.f);                              \
+      const __m512 vec_zero = _mm512_set1_ps(0.f);                             \
+      const __m512 vec_two = _mm512_set1_ps(2.f);                              \
+      const __m512 vec_ln2f =                                                  \
+          _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218));                  \
+      const __m512 vec_ln_flt_min =                                            \
+          _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));                  \
+      const __m512 vec_ln_flt_max =                                            \
+          _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));                  \
+      const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);                   \
+      const int n_mantissa_bits = 23;                                          \
+      auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__((              \
+                          always_inline)) {                                    \
+        __m512 values = vec.reg;                                               \
+        auto less_ln_flt_min_mask =                                            \
+            _mm512_cmp_ps_mask(values, vec_ln_flt_min, 1 /*_CMP_LT_OS*/);      \
+        auto vec_src = _mm512_min_ps(values, vec_ln_flt_max);                  \
+        vec_src = _mm512_max_ps(vec_src, vec_ln_flt_min);                      \
+        auto vec_fx = _mm512_fmadd_ps(vec_src, vec_exp_log2ef, vec_half);      \
+        auto vec_fx_i = _mm512_cvt_roundps_epi32(                              \
+            vec_fx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);                \
+        vec_fx = _mm512_cvtepi32_ps(vec_fx_i);                                 \
+        auto vec_exp_poly = _mm512_fnmadd_ps(vec_fx, vec_ln2f, vec_src);       \
+        auto vec_res =                                                         \
+            _mm512_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4);   \
+        vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3);     \
+        vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2);     \
+        vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1);     \
+        vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_one);             \
+        auto vec_exp_number = _mm512_sub_ps(vec_fx, vec_one);                  \
+        auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number);            \
+        auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127);    \
+        vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); \
+        auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i);             \
+        vec_two_pow_n = _mm512_mask_blend_ps(less_ln_flt_min_mask,             \
+                                             vec_two_pow_n, vec_zero);         \
+        vec_res = _mm512_mul_ps(vec_res, vec_two_pow_n);                       \
+        vec_res = _mm512_mul_ps(vec_res, vec_two);                             \
+        vec_op::FP32Vec16 res(vec_res);                                        \
+        return res;                                                            \
+      };
+  #endif
+
+#endif
+
+#endif
--- a/csrc/cpu/cpu_attn_neon.hpp
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -0,0 +1,386 @@
+#ifndef CPU_ATTN_NEON_HPP
+#define CPU_ATTN_NEON_HPP
+
+#include "cpu_attn_impl.hpp"
+#include <arm_neon.h>
+#include <type_traits>
+namespace cpu_attention {
+
+namespace {
+
+#define BLOCK_SIZE_ALIGNMENT 32
+#define HEAD_SIZE_ALIGNMENT 32
+#define MAX_Q_HEAD_NUM_PER_ITER 16
+
+// These do not use vectorized class for loading / converting
+// because csrc/cpu/cpu_types_arm.hpp does not have fallback options
+// for vec_op::BF16Vec* / vec_op::BF16Vec* on Arm HW that
+// doesn't support BF16.
+// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency.
+template <typename kv_cache_t>
+FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0,
+                                     float32x4_t& b1);
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32<float>(const float* p, float32x4_t& b0,
+                                            float32x4_t& b1) {
+  b0 = vld1q_f32(p + 0);
+  b1 = vld1q_f32(p + 4);
+}
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32<c10::Half>(const c10::Half* p,
+                                                float32x4_t& b0,
+                                                float32x4_t& b1) {
+  const float16_t* h = reinterpret_cast<const float16_t*>(p);
+  float16x8_t v = vld1q_f16(h);
+  b0 = vcvt_f32_f16(vget_low_f16(v));
+  b1 = vcvt_f32_f16(vget_high_f16(v));
+}
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
+                                                    float32x4_t& b0,
+                                                    float32x4_t& b1) {
+  const uint16_t* u = reinterpret_cast<const uint16_t*>(p);
+#ifdef ARM_BF16_SUPPORT
+  uint16x8_t u0 = vld1q_u16(u);
+  bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0);
+  b0 = vcvtq_low_f32_bf16(bf0);
+  b1 = vcvtq_high_f32_bf16(bf0);
+#else
+  uint16x8_t x0 = vld1q_u16(u);
+  uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16);
+  uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16);
+  b0 = vreinterpretq_f32_u32(lo);
+  b1 = vreinterpretq_f32_u32(hi);
+#endif
+}
+
+// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
+// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
+// #FMLAs = (K // 4) * (4 * 2 * M)
+// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
+template <int32_t M, typename kv_cache_t>
+FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4(
+    const float* __restrict A,       // [M x K],
+    const kv_cache_t* __restrict B,  // [K x 8],
+    float* __restrict C,             // [M x 8],
+    int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
+  // kernel supports max M of 8, as it'd spill for larger M
+  static_assert(1 <= M && M <= 8, "M must be in [1,8]");
+
+// helpers for per-M codegen
+#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
+#define IF_M(i) if constexpr (M > (i))
+
+  // A row base pointers
+#define DECL_A(i) const float* a##i = A + (i) * lda;
+  ROWS_APPLY(DECL_A)
+#undef DECL_A
+
+  // declare 2 accumulators per row of M
+#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1;
+  ROWS_APPLY(DECL_ACC)
+#undef DECL_ACC
+
+  // initialize accumulators
+#define INIT_ACC(i)                              \
+  IF_M(i) {                                      \
+    if (accumulate) {                            \
+      acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \
+      acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \
+    } else {                                     \
+      acc##i##_0 = vdupq_n_f32(0.f);             \
+      acc##i##_1 = vdupq_n_f32(0.f);             \
+    }                                            \
+  }
+  ROWS_APPLY(INIT_ACC)
+#undef INIT_ACC
+
+  int32_t k = 0;
+
+  // K unrolled by 4
+  for (; k + 3 < K; k += 4) {
+    // load A[k..k+3] for each active row (M)
+#define LOAD_A4(i)     \
+  float32x4_t a##i##v; \
+  IF_M(i) a##i##v = vld1q_f32(a##i + k);
+    ROWS_APPLY(LOAD_A4)
+#undef LOAD_A4
+
+    // helper: FMA lane L from aiv
+#define FMAS_LANE(i, aiv, L)                              \
+  IF_M(i) {                                               \
+    acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \
+    acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \
+  }
+
+    // k + 0
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 0) * ldb, b0, b1);
+#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
+      ROWS_APPLY(STEP_K0)
+#undef STEP_K0
+    }
+    // k + 1
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 1) * ldb, b0, b1);
+#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
+      ROWS_APPLY(STEP_K1)
+#undef STEP_K1
+    }
+    // k + 2
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 2) * ldb, b0, b1);
+#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
+      ROWS_APPLY(STEP_K2)
+#undef STEP_K2
+    }
+    // k + 3
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 3) * ldb, b0, b1);
+#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
+      ROWS_APPLY(STEP_K3)
+#undef STEP_K3
+    }
+#undef FMAS_LANE
+  }
+
+  // K tail
+  for (; k < K; ++k) {
+    float32x4_t b0, b1;
+    load_row8_B_as_f32<kv_cache_t>(B + (int64_t)k * ldb, b0, b1);
+#define TAIL_ROW(i)                             \
+  IF_M(i) {                                     \
+    float32x4_t ai = vdupq_n_f32(*(a##i + k));  \
+    acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \
+    acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \
+  }
+    ROWS_APPLY(TAIL_ROW)
+#undef TAIL_ROW
+  }
+
+  // store accumulators to C
+#define STORE_ROW(i)                          \
+  IF_M(i) {                                   \
+    vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \
+    vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \
+  }
+  ROWS_APPLY(STORE_ROW)
+#undef STORE_ROW
+
+#undef ROWS_APPLY
+#undef IF_M
+}
+
+template <int32_t N, typename kv_cache_t>
+FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A,
+                                               const kv_cache_t* __restrict B,
+                                               float* __restrict C, int32_t M,
+                                               int32_t K, int64_t lda,
+                                               int64_t ldb, int64_t ldc,
+                                               bool accumulate) {
+  // micro kernel is Mx8
+  static_assert(N % 8 == 0, "N must be a multiple of 8");
+  for (int32_t m = 0; m < M;) {
+    int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
+    const float* Ab = A + m * lda;
+    float* Cb = C + m * ldc;
+
+    for (int32_t n = 0; n < N; n += 8) {
+      const kv_cache_t* Bn = B + n;
+      float* Cn = Cb + n;
+      switch (mb) {
+        case 8:
+          gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                      K, accumulate);
+          break;
+        case 4:
+          gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                      K, accumulate);
+          break;
+        case 2:
+          gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                      K, accumulate);
+          break;
+        default:
+          gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                      K, accumulate);
+          break;
+      }
+    }
+    // no tail loop for N as it's guaranteed to be a multiple of 8
+    m += mb;
+  }
+}
+
+template <typename kv_cache_t>
+class TileGemmNeonFMLA {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    if constexpr (phase == AttentionGemmPhase::QK) {
+      gemm_macro_neon_fmla_Mx8_Ku4<BLOCK_SIZE_ALIGNMENT, kv_cache_t>(
+          a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
+    } else {
+      gemm_macro_neon_fmla_Mx8_Ku4<HEAD_SIZE_ALIGNMENT, kv_cache_t>(
+          a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
+          accum_c);
+    }
+  }
+};
+
+}  // namespace
+
+// this is similar to "ISA::VEC" at the moment
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment =
+      BLOCK_SIZE_ALIGNMENT;  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      HEAD_SIZE_ALIGNMENT;  // headdim num unit of PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::NEON;
+  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+
+  static_assert(HeadDim % HeadDimAlignment == 0);
+  // the gemm micro kernel is Mx8
+  static_assert(HeadDimAlignment % 8 == 0);
+  static_assert(BlockSizeAlignment % 8 == 0);
+
+ public:
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<TileGemmNeonFMLA<kv_cache_t>> attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // k_cache_token_group_stride: stride of K cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
+                                // block_size], row-major
+  }
+
+  // v_cache_token_group_stride: stride of V cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
+                                           // head_dim], row-major
+  }
+
+  // v_cache_head_group_stride: stride of V cache when move to next
+  // HeadDimAlignment head dims in a block
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
+                              // row-major
+  }
+
+  // Copy q to q_buffer and cast it to fp32
+  static void copy_q_heads_tile(
+      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
+      float* __restrict__ q_buffer, const int32_t q_num,
+      const int32_t q_heads_per_kv, const int64_t q_num_stride,
+      const int64_t q_head_stride, float scale) {
+    static_assert(head_dim % 16 == 0);
+    constexpr int32_t unroll_size = head_dim / 16;
+    using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
+
+    vec_op::FP32Vec16 scale_vec(scale);
+    for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
+      for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
+        scalar_t* __restrict__ curr_q =
+            src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
+        float* __restrict__ curr_q_buffer =
+            q_buffer + q_num_idx * q_heads_per_kv * head_dim +
+            q_head_idx * head_dim;
+
+        vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
+          load_vec_t vec(curr_q);
+          vec_op::FP32Vec16 fp32_vec(vec);
+          fp32_vec = fp32_vec * scale_vec;
+          fp32_vec.save(curr_q_buffer);
+
+          curr_q += 16;
+          curr_q_buffer += 16;
+        });
+      }
+    }
+  }
+
+  // reshape K as column-major and V as row-major
+  static void reshape_and_cache(
+      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride, const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size, const int64_t block_size_stride) {
+#pragma omp parallel for collapse(2)
+    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) {
+          // skip
+          continue;
+        }
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+        {
+          // Write Key
+          const scalar_t* key_start_ptr = key +
+                                          token_idx * key_token_num_stride +
+                                          head_idx * key_head_num_stride;
+          scalar_t* key_cache_start_ptr =
+              key_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride + block_offset;
+
+#pragma GCC unroll 8
+          for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
+            key_cache_start_ptr[j] = key_start_ptr[i];
+          }
+        }
+        {
+          // Write Value
+          const scalar_t* value_start_ptr = value +
+                                            token_idx * value_token_num_stride +
+                                            head_idx * value_head_num_stride;
+          scalar_t* value_cache_start_ptr =
+              value_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride + block_offset * head_dim;
+          std::memcpy(value_cache_start_ptr, value_start_ptr,
+                      sizeof(scalar_t) * head_dim);
+        }
+      }
+    }
+  }
+};
+}  // namespace cpu_attention
+
+#endif  // #ifndef CPU_ATTN_NEON_HPP
--- a/csrc/cpu/cpu_attn_vec.hpp
+++ b/csrc/cpu/cpu_attn_vec.hpp
@@ -0,0 +1,248 @@
+#ifndef CPU_ATTN_VEC_HPP
+#define CPU_ATTN_VEC_HPP
+
+#include "cpu_attn_impl.hpp"
+
+namespace cpu_attention {
+
+namespace {
+// 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32]
+template <typename kv_cache_t>
+class TileGemm82 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    switch (m_size) {
+      case 1:
+        gemm_micro<1>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 2:
+        gemm_micro<2>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 3:
+      case 4:
+        gemm_micro<4>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 5:
+      case 6:
+        gemm_micro<6>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 7:
+      case 8:
+        gemm_micro<8>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+    }
+  }
+
+  template <int32_t M>
+  static void gemm_micro(float* __restrict__ a_tile,
+                         kv_cache_t* __restrict__ b_tile,
+                         float* __restrict__ c_tile, const int64_t lda,
+                         const int64_t ldb, const int64_t ldc,
+                         const int32_t block_size, const int32_t dynamic_k_size,
+                         const bool accum_c) {
+    static_assert(0 < M <= 8);
+    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
+
+    kv_cache_t* __restrict__ curr_b_0 = b_tile;
+    kv_cache_t* __restrict__ curr_b_1 = b_tile + 16;
+    float* __restrict__ curr_c_0 = c_tile;
+    float* __restrict__ curr_c_1 = c_tile + 16;
+
+    vec_op::FP32Vec16 c_regs[M * 2];
+    if (accum_c) {
+      float* __restrict__ curr_m_c_0 = curr_c_0;
+      float* __restrict__ curr_m_c_1 = curr_c_1;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0);
+        c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1);
+
+        // update
+        curr_m_c_0 += ldc;
+        curr_m_c_1 += ldc;
+      });
+    }
+
+    float* __restrict__ curr_a = a_tile;
+    for (int32_t k = 0; k < dynamic_k_size; ++k) {
+      load_vec_t b_0_reg(curr_b_0);
+      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
+      load_vec_t b_1_reg(curr_b_1);
+      vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg);
+
+      float* __restrict__ curr_m_a = curr_a;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        float v = *curr_m_a;
+        vec_op::FP32Vec16 a_reg(v);
+        c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg;
+        c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg;
+
+        // update
+        curr_m_a += lda;
+      });
+
+      // update
+      curr_a += 1;
+      curr_b_0 += ldb;
+      curr_b_1 += ldb;
+    }
+
+    vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+      c_regs[i * 2].save(curr_c_0);
+      c_regs[i * 2 + 1].save(curr_c_1);
+
+      // update
+      curr_c_0 += ldc;
+      curr_c_1 += ldc;
+    });
+  }
+};
+}  // namespace
+
+// This is a general but naive implementation based on vector instructions
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment =
+      32;  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      32;  // headdim num unit of PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = 8;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::VEC;
+  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+
+ public:
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<TileGemm82<kv_cache_t>> attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // k_cache_token_group_stride: stride of K cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
+                                // block_size], row-major
+  }
+
+  // v_cache_token_group_stride: stride of V cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
+                                           // head_dim], row-major
+  }
+
+  // v_cache_head_group_stride: stride of V cache when move to next
+  // HeadDimAlignment head dims in a block
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
+                              // row-major
+  }
+
+  // Copy q to q_buffer and cast it to fp32
+  static void copy_q_heads_tile(
+      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
+      float* __restrict__ q_buffer, const int32_t q_num,
+      const int32_t q_heads_per_kv, const int64_t q_num_stride,
+      const int64_t q_head_stride, float scale) {
+    static_assert(head_dim % 16 == 0);
+    constexpr int32_t unroll_size = head_dim / 16;
+    using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
+
+    vec_op::FP32Vec16 scale_vec(scale);
+    for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
+      for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
+        scalar_t* __restrict__ curr_q =
+            src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
+        float* __restrict__ curr_q_buffer =
+            q_buffer + q_num_idx * q_heads_per_kv * head_dim +
+            q_head_idx * head_dim;
+
+        vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
+          load_vec_t vec(curr_q);
+          vec_op::FP32Vec16 fp32_vec(vec);
+          fp32_vec = fp32_vec * scale_vec;
+          fp32_vec.save(curr_q_buffer);
+
+          curr_q += 16;
+          curr_q_buffer += 16;
+        });
+      }
+    }
+  }
+
+  // reshape K as column-major and V as row-major
+  static void reshape_and_cache(
+      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride, const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size, const int64_t block_size_stride) {
+#pragma omp parallel for collapse(2)
+    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) {
+          // skip
+          continue;
+        }
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+        {
+          // Write Key as column-major
+          const scalar_t* key_start_ptr = key +
+                                          token_idx * key_token_num_stride +
+                                          head_idx * key_head_num_stride;
+          scalar_t* key_cache_start_ptr =
+              key_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride + block_offset;
+
+#pragma GCC unroll 8
+          for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
+            key_cache_start_ptr[j] = key_start_ptr[i];
+          }
+        }
+        {
+          // Write Value as row-major
+          const scalar_t* value_start_ptr = value +
+                                            token_idx * value_token_num_stride +
+                                            head_idx * value_head_num_stride;
+          scalar_t* value_cache_start_ptr =
+              value_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride + block_offset * head_dim;
+          std::memcpy(value_cache_start_ptr, value_start_ptr,
+                      sizeof(scalar_t) * head_dim);
+        }
+      }
+    }
+  }
+};
+}  // namespace cpu_attention
+
+#endif
--- a/csrc/cpu/cpu_attn_vec16.hpp
+++ b/csrc/cpu/cpu_attn_vec16.hpp
@@ -0,0 +1,171 @@
+#ifndef CPU_ATTN_VEC16_HPP
+#define CPU_ATTN_VEC16_HPP
+
+#include "cpu_attn_vec.hpp"
+
+namespace cpu_attention {
+
+namespace {
+// 16-1-16 pattern, 16 regs for A, 1 regs for B, 16 regs for C, [16, K] @ [k,
+// 16]
+template <typename kv_cache_t>
+class TileGemm161 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    switch (m_size) {
+      case 1:
+        gemm_micro<1>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 2:
+        gemm_micro<2>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 3:
+      case 4:
+        gemm_micro<4>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 5:
+      case 6:
+        gemm_micro<6>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 7:
+      case 8:
+        gemm_micro<8>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                      dynamic_k_size, accum_c);
+        break;
+      case 9:
+      case 10:
+      case 11:
+      case 12:
+        gemm_micro<12>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                       dynamic_k_size, accum_c);
+        break;
+      case 13:
+      case 14:
+      case 15:
+      case 16:
+        gemm_micro<16>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
+                       dynamic_k_size, accum_c);
+        break;
+    }
+  }
+
+  template <int32_t M>
+  static void gemm_micro(float* __restrict__ a_tile,
+                         kv_cache_t* __restrict__ b_tile,
+                         float* __restrict__ c_tile, const int64_t lda,
+                         const int64_t ldb, const int64_t ldc,
+                         const int32_t block_size, const int32_t dynamic_k_size,
+                         const bool accum_c) {
+    static_assert(0 < M <= 16);
+    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
+
+    kv_cache_t* __restrict__ curr_b_0 = b_tile;
+    float* __restrict__ curr_c_0 = c_tile;
+
+    vec_op::FP32Vec16 c_regs[M];
+    if (accum_c) {
+      float* __restrict__ curr_m_c_0 = curr_c_0;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        c_regs[i] = vec_op::FP32Vec16(curr_m_c_0);
+
+        // update
+        curr_m_c_0 += ldc;
+      });
+    }
+
+    float* __restrict__ curr_a = a_tile;
+    for (int32_t k = 0; k < dynamic_k_size; ++k) {
+      load_vec_t b_0_reg(curr_b_0);
+      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
+
+      float* __restrict__ curr_m_a = curr_a;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        float v = *curr_m_a;
+        vec_op::FP32Vec16 a_reg(v);
+        c_regs[i] = c_regs[i] + a_reg * fp32_b_0_reg;
+
+        // update
+        curr_m_a += lda;
+      });
+
+      // update
+      curr_a += 1;
+      curr_b_0 += ldb;
+    }
+
+    vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+      c_regs[i].save(curr_c_0);
+
+      // update
+      curr_c_0 += ldc;
+    });
+  }
+};
+}  // namespace
+
+// This is a general but naive implementation based on vector instructions
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::VEC16, scalar_t, head_dim>
+    : public AttentionImpl<ISA::VEC, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment =
+      16;  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      16;  // headdim num unit of PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = 16;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::VEC16;
+  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+
+ public:
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<TileGemm161<kv_cache_t>> attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // k_cache_token_group_stride: stride of K cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t k_cache_token_group_stride(
+      const int32_t block_size) {
+    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
+                                // block_size], row-major
+  }
+
+  // v_cache_token_group_stride: stride of V cache when move to next
+  // BlockSizeAlignment tokens in a block
+  constexpr static int64_t v_cache_token_group_stride(
+      const int32_t block_size) {
+    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
+                                           // head_dim], row-major
+  }
+
+  // v_cache_head_group_stride: stride of V cache when move to next
+  // HeadDimAlignment head dims in a block
+  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
+                              // row-major
+  }
+};
+}  // namespace cpu_attention
+
+#endif
--- a/csrc/cpu/cpu_types_scalar.hpp
+++ b/csrc/cpu/cpu_types_scalar.hpp
@@ -26,10 +26,6 @@ namespace vec_op {

 #define FORCE_INLINE __attribute__((always_inline)) inline

-#define __max(a, b) ((a) > (b) ? (a) : (b))
-#define __min(a, b) ((a) < (b) ? (a) : (b))
-#define __abs(a) ((a) < (0) ? (0 - a) : (a))
-
 typedef struct f16x8_t {
  uint16_t val[8];
 } f16x8_t;
@@ -99,7 +95,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
  void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }

  void save(void* ptr, const int elem_num) const {
-    int num = __min(elem_num, VEC_ELEM_NUM);
+    int num = std::min(elem_num, VEC_ELEM_NUM);
    std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
  }
 };
@@ -128,7 +124,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }

  void save(void* ptr, const int elem_num) const {
-    int num = __min(elem_num, VEC_ELEM_NUM);
+    int num = std::min(elem_num, VEC_ELEM_NUM);
    std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
  }
 };
@@ -143,9 +139,9 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
  explicit BF16Vec32(f16x32_t data) : reg(data) {};

  explicit BF16Vec32(BF16Vec8& vec8_data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+    unroll_loop<int, VEC_ELEM_NUM>([&vec8_data, this](int i) {
      reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
-    }
+    });
  }

  void save(void* ptr) const { *reinterpret_cast<f16x32_t*>(ptr) = reg; }
@@ -157,15 +153,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> {
  f32x4_t reg;

  explicit FP32Vec4(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([&v, this](int i) { reg.val[i] = v; });
  }

  explicit FP32Vec4() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([this](int i) { reg.val[i] = 0.0f; });
  }

  explicit FP32Vec4(const float* ptr)
@@ -182,15 +174,11 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
  f32x8_t reg;

  explicit FP32Vec8(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([&v, this](int i) { reg.val[i] = v; });
  }

  explicit FP32Vec8() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([this](int i) { reg.val[i] = 0.0f; });
  }

  explicit FP32Vec8(const float* ptr)
@@ -201,78 +189,68 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
  explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};

  explicit FP32Vec8(const FP16Vec8& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = fp16_to_float(v.reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); });
  }

  FP32Vec8(const BF16Vec8& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = bf16_to_float(v.reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); });
  }

  float reduce_sum() const {
    float result = 0;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result += reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, this](int i) { result += reg.val[i]; });
    return result;
  }

  FP32Vec8 exp() const {
    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = expf(reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, this](int i) { ret.val[i] = expf(reg.val[i]); });
    return FP32Vec8(ret);
  }

  FP32Vec8 tanh() const {
    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = tanhf(reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, this](int i) { ret.val[i] = tanhf(reg.val[i]); });
    return FP32Vec8(ret);
  }

  FP32Vec8 er() const {
    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = erf(reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, this](int i) { ret.val[i] = erf(reg.val[i]); });
    return FP32Vec8(ret);
  }

  FP32Vec8 operator*(const FP32Vec8& b) const {
    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] * b.reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; });
    return FP32Vec8(ret);
  }

  FP32Vec8 operator+(const FP32Vec8& b) const {
    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] + b.reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; });
    return FP32Vec8(ret);
  }

  FP32Vec8 operator-(const FP32Vec8& b) const {
    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] - b.reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; });
    return FP32Vec8(ret);
  }

  FP32Vec8 operator/(const FP32Vec8& b) const {
    f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] / b.reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; });
    return FP32Vec8(ret);
  }

@@ -284,15 +262,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  f32x16_t reg;

  explicit FP32Vec16(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([&v, this](int i) { reg.val[i] = v; });
  }

  explicit FP32Vec16() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([this](int i) { reg.val[i] = 0.0f; });
  }

  explicit FP32Vec16(const float* ptr)
@@ -301,29 +275,27 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  explicit FP32Vec16(f32x16_t data) : reg(data) {};

  FP32Vec16(const FP32Vec4& data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+    unroll_loop<int, VEC_ELEM_NUM>([&data, this](int i) {
      reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
-    }
+    });
  }

  FP32Vec16(const FP32Vec8& data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+    unroll_loop<int, VEC_ELEM_NUM>([&data, this](int i) {
      reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
-    }
+    });
  }

  FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};

  explicit FP32Vec16(const FP16Vec16& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = fp16_to_float(v.reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); });
  }

  explicit FP32Vec16(const BF16Vec16& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = bf16_to_float(v.reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); });
  }

  explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
@@ -331,82 +303,74 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};

  FP32Vec16 operator*(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] * b.reg.val[i];
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; });
+    return FP32Vec16(ret);
  }

  FP32Vec16 operator+(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] + b.reg.val[i];
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; });
+    return FP32Vec16(ret);
  }

  FP32Vec16 operator-(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] - b.reg.val[i];
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; });
+    return FP32Vec16(ret);
  }

  FP32Vec16 operator/(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] / b.reg.val[i];
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; });
+    return FP32Vec16(ret);
  }

  FP32Vec16 max(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>([&ret, &b, this](int i) {
+      ret.val[i] = std::max(reg.val[i], b.reg.val[i]);
+    });
+    return FP32Vec16(ret);
  }

  FP32Vec16 min(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>([&ret, &b, this](int i) {
+      ret.val[i] = std::min(reg.val[i], b.reg.val[i]);
+    });
+    return FP32Vec16(ret);
  }

  FP32Vec16 abs() const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __abs(reg.val[i]);
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, this](int i) { ret.val[i] = std::abs(reg.val[i]); });
+    return FP32Vec16(ret);
  }

  float reduce_sum() const {
    float result = 0.0f;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result += reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, this](int i) { result += reg.val[i]; });
    return result;
  }

  float reduce_max() const {
-    float result = reg.val[0];
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result = __max(reg.val[i], result);
-    }
+    float result = std::numeric_limits<float>::lowest();
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, this](int i) { result = std::max(reg.val[i], result); });
    return result;
  }

  float reduce_min() const {
-    float result = reg.val[0];
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result = __min(reg.val[i], result);
-    }
+    float result = std::numeric_limits<float>::max();
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, this](int i) { result = std::min(reg.val[i], result); });
    return result;
  }

@@ -414,13 +378,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    float sum = 0.0;
-    int start = idx * group_size;
-    int end = (idx + 1) * group_size;
-
-    for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
-      sum += reg.val[start];
-    }
-
+    const int start = idx * group_size;
+    unroll_loop<int, group_size>(
+        [&sum, &start, this](int i) { sum += reg.val[start + i]; });
    return sum;
  }

@@ -477,17 +437,13 @@ inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
 }

 inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
-  int i = 0;
-  for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_fp16(v.reg.val[i]);
-  }
+  unroll_loop<int, FP16Vec16::VEC_ELEM_NUM>(
+      [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); });
 }

 inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
-  int i = 0;
-  for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_fp16(v.reg.val[i]);
-  }
+  unroll_loop<int, FP16Vec8::VEC_ELEM_NUM>(
+      [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); });
 }

 inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
@@ -495,17 +451,13 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
 }

 inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
-  int i = 0;
-  for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_bf16(v.reg.val[i]);
-  }
+  unroll_loop<int, BF16Vec8::VEC_ELEM_NUM>(
+      [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); });
 }

 inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
-  int i = 0;
-  for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_bf16(v.reg.val[i]);
-  }
+  unroll_loop<int, BF16Vec16::VEC_ELEM_NUM>(
+      [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); });
 }

 inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -4,6 +4,7 @@

 #include <vecintrin.h>
 #include <cmath>
+#include <limits>
 #include <torch/all.h>
 namespace vec_op {

@@ -174,8 +175,9 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
  }

  explicit FP32Vec8(const BF16Vec8& v) {
-    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
-    reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
+    // On big-endian s390x, place BF16 first to get correct byte order
+    reg.val[0] = (__vector float)vec_mergeh(v.reg, zero);
+    reg.val[1] = (__vector float)vec_mergel(v.reg, zero);
  }

  float reduce_sum() const {
@@ -189,51 +191,257 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
  }

  FP32Vec8 exp() const {
-    // TODO: Vectorize this
-    AliasReg ar;
-    ar.reg = reg;
-    f32x4x4_t ret;
-    ret.val[0][0] = std::exp(ar.values[0]);
-    ret.val[0][1] = std::exp(ar.values[1]);
-    ret.val[0][2] = std::exp(ar.values[2]);
-    ret.val[0][3] = std::exp(ar.values[3]);
-    ret.val[1][0] = std::exp(ar.values[4]);
-    ret.val[1][1] = std::exp(ar.values[5]);
-    ret.val[1][2] = std::exp(ar.values[6]);
-    ret.val[1][3] = std::exp(ar.values[7]);
-    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+    f32x4x2_t out;
+
+    const __vector float log2e = vec_splats(1.44269504088896341f);
+    const __vector float one = vec_splats(1.0f);
+    const __vector float min_x = vec_splats(-87.3f);
+    const __vector float max_x = vec_splats(88.7f);
+
+    // 5th-degree minimax polynomial for 2^r (r in [0,1))
+    const __vector float c1 = vec_splats(0.6931471805599453f);
+    const __vector float c2 = vec_splats(0.240226506959101f);
+    const __vector float c3 = vec_splats(0.05550410866482158f);
+    const __vector float c4 = vec_splats(0.009618129107628477f);
+    const __vector float c5 = vec_splats(0.0013333558146428443f);
+
+    for (int i = 0; i < 2; i++) {
+      __vector float x = reg.val[i];
+
+      x = vec_max(x, min_x);
+      x = vec_min(x, max_x);
+
+      __vector float y = vec_mul(x, log2e);
+
+      __vector float kf = vec_floor(y);
+      __vector float r = vec_sub(y, kf);
+
+      __vector signed int k = vec_signed(kf);
+      const __vector signed int min_k = vec_splats((signed int)-126);
+      const __vector signed int max_k = vec_splats((signed int)127);
+      k = vec_min(vec_max(k, min_k), max_k);
+
+      // Build 2^k from exponent bits
+      __vector signed int exp_int = vec_add(k, vec_splats((signed int)127));
+      __vector unsigned int bits = (__vector unsigned int)exp_int;
+      bits = vec_sl(bits, vec_splats((unsigned int)23));
+      __vector float pow2k = (__vector float)bits;
+
+      // Improved minimax polynomial
+      __vector float poly = vec_madd(c5, r, c4);
+      poly = vec_madd(poly, r, c3);
+      poly = vec_madd(poly, r, c2);
+      poly = vec_madd(poly, r, c1);
+      poly = vec_madd(poly, r, one);
+
+      out.val[i] = vec_mul(pow2k, poly);
+    }
+
+    return FP32Vec8(out);
  }

  FP32Vec8 tanh() const {
-    // TODO: Vectorize this
-    AliasReg ar;
-    ar.reg = reg;
-    f32x4x4_t ret;
-    ret.val[0][0] = std::tanh(ar.values[0]);
-    ret.val[0][1] = std::tanh(ar.values[1]);
-    ret.val[0][2] = std::tanh(ar.values[2]);
-    ret.val[0][3] = std::tanh(ar.values[3]);
-    ret.val[1][0] = std::tanh(ar.values[4]);
-    ret.val[1][1] = std::tanh(ar.values[5]);
-    ret.val[1][2] = std::tanh(ar.values[6]);
-    ret.val[1][3] = std::tanh(ar.values[7]);
-    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+    // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
+    const __vector float one = vec_splats(1.0f);
+    const __vector float two = vec_splats(2.0f);
+    const __vector float zero = vec_splats(0.0f);
+    const __vector float sat =
+        vec_splats(9.0f);  // beyond this, tanh(x) ~ sign(x)
+
+    f32x4x2_t out;
+
+    for (int i = 0; i < 2; i++) {
+      __vector float x = reg.val[i];
+      __vector float ax = vec_abs(x);
+
+      // sign(x): +1 or -1
+      __vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
+
+      // saturation mask: |x| > sat
+      __vector __bool int saturated = vec_cmpgt(ax, sat);
+
+      // 2x
+      __vector float two_x = vec_mul(x, two);
+
+      // Build a temporary FP32Vec8 with both lanes = 2x, reuse exp()
+      f32x4x2_t tmp;
+      tmp.val[0] = two_x;
+      tmp.val[1] = two_x;
+      FP32Vec8 exp_2x_vec(tmp);
+
+      FP32Vec8 e2x = exp_2x_vec.exp();
+      __vector float e = e2x.reg.val[i];
+
+      // tanh(x) = (e - 1) / (e + 1)
+      __vector float num = vec_sub(e, one);
+      __vector float den = vec_add(e, one);
+
+      __vector float t = vec_div(num, den);
+
+      // For large |x|, clamp to sign(x)
+      out.val[i] = vec_sel(t, sign, saturated);
+    }
+
+    return FP32Vec8(out);
  }

  FP32Vec8 er() const {
-    // TODO: Vectorize this
-    AliasReg ar;
-    ar.reg = reg;
-    f32x4x4_t ret;
-    ret.val[0][0] = std::erf(ar.values[0]);
-    ret.val[0][1] = std::erf(ar.values[1]);
-    ret.val[0][2] = std::erf(ar.values[2]);
-    ret.val[0][3] = std::erf(ar.values[3]);
-    ret.val[1][0] = std::erf(ar.values[4]);
-    ret.val[1][1] = std::erf(ar.values[5]);
-    ret.val[1][2] = std::erf(ar.values[6]);
-    ret.val[1][3] = std::erf(ar.values[7]);
-    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+    // A&S 7.1.26 approximation:
+    // erf(x) = sign(x) * (1 - ((((a5*t + a4)*t + a3)*t + a2)*t + a1) * t *
+    // exp(-x^2)) t = 1 / (1 + p*|x|),  p = 0.3275911
+
+    const __vector float one = vec_splats(1.0f);
+    const __vector float zero = vec_splats(0.0f);
+    const __vector float p = vec_splats(0.3275911f);
+
+    // Polynomial coeffs
+    const __vector float a1 = vec_splats(0.254829592f);
+    const __vector float a2 = vec_splats(-0.284496736f);
+    const __vector float a3 = vec_splats(1.421413741f);
+    const __vector float a4 = vec_splats(-1.453152027f);
+    const __vector float a5 = vec_splats(1.061405429f);
+
+    // Threshold where erf(x) ~ sign(x)
+    const __vector float sat = vec_splats(6.0f);
+
+    f32x4x2_t out;
+
+    for (int lane = 0; lane < 2; lane++) {
+      __vector float x = reg.val[lane];
+      __vector float ax = vec_abs(x);
+
+      // sign(x)
+      __vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
+
+      // |x| > 6 → erf(x) = ±1
+      __vector __bool int saturated = vec_cmpgt(ax, sat);
+
+      // t = 1 / (1 + p * |x|)
+      __vector float t = vec_madd(p, ax, one);
+      t = vec_div(one, t);
+
+      // poly = a5
+      __vector float poly = a5;
+      poly = vec_madd(poly, t, a4);
+      poly = vec_madd(poly, t, a3);
+      poly = vec_madd(poly, t, a2);
+      poly = vec_madd(poly, t, a1);
+
+      // full polynomial: poly = poly * t
+      poly = vec_mul(poly, t);
+
+      // Compute exp(-x^2)
+      __vector float x2 = vec_mul(x, x);
+      __vector float neg_x2 = vec_neg(x2);
+
+      f32x4x2_t tmp;
+      tmp.val[0] = neg_x2;
+      tmp.val[1] = neg_x2;
+      FP32Vec8 exp_neg_x2(tmp);
+
+      FP32Vec8 e = exp_neg_x2.exp();
+      __vector float ex = e.reg.val[lane];
+
+      // erf(x) = sign * (1 - poly * exp(-x^2))
+      __vector float term = vec_mul(poly, ex);
+      __vector float y = vec_sub(one, term);
+      y = vec_mul(y, sign);
+
+      // saturated → ±1
+      __vector float sat_val = vec_mul(sign, one);
+      out.val[lane] = vec_sel(y, sat_val, saturated);
+    }
+
+    return FP32Vec8(out);
+  }
+  // Elementwise sigmoid(x) = 1 / (1 + exp(-x))
+  FP32Vec8 sigmoid() const {
+    const __vector float one = vec_splats(1.0f);
+
+    f32x4x2_t neg;
+    for (int i = 0; i < 2; ++i) {
+      neg.val[i] = vec_neg(reg.val[i]);
+    }
+
+    FP32Vec8 neg_x(neg);
+    FP32Vec8 e = neg_x.exp();  // exp(-x)
+
+    f32x4x2_t denom;
+    for (int i = 0; i < 2; ++i) {
+      denom.val[i] = vec_add(one, e.reg.val[i]);
+    }
+
+    FP32Vec8 denom_vec(denom);
+    FP32Vec8 one_vec(1.0f);
+
+    return one_vec / denom_vec;
+  }
+
+  // Tanh-based GELU:
+  // gelu(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
+  FP32Vec8 gelu_tanh() const {
+    const __vector float k_s2pi = vec_splats(0.7978845608028654f);  // √(2/π)
+    const __vector float k_0_0447 = vec_splats(0.044715f);
+
+    f32x4x2_t x2, x3, inner;
+    for (int i = 0; i < 2; ++i) {
+      __vector float x = reg.val[i];
+      x2.val[i] = vec_mul(x, x);                            // x^2
+      x3.val[i] = vec_mul(x2.val[i], x);                    // x^3
+      __vector float t = vec_madd(k_0_0447, x3.val[i], x);  // x + 0.044715*x^3
+      inner.val[i] = vec_mul(k_s2pi, t);                    // √(2/π)*(...)
+    }
+
+    FP32Vec8 inner_vec(inner);
+    FP32Vec8 t = inner_vec.tanh();  // tanh part
+
+    FP32Vec8 one_vec(1.0f);
+    FP32Vec8 half_vec(0.5f);
+
+    FP32Vec8 x_vec(*this);
+    return x_vec * half_vec * (one_vec + t);
+  }
+
+  // Erf-based GELU:
+  // gelu(x) = 0.5 * x * (1 + erf(x / √2))
+  FP32Vec8 gelu_erf() const {
+    const __vector float inv_sqrt2 = vec_splats(0.7071067811865476f);  // 1/√2
+    FP32Vec8 x_vec(*this);
+
+    f32x4x2_t scaled;
+    for (int i = 0; i < 2; ++i) {
+      scaled.val[i] = vec_mul(reg.val[i], inv_sqrt2);
+    }
+    FP32Vec8 x_scaled(scaled);
+
+    FP32Vec8 erf_x = x_scaled.er();
+
+    FP32Vec8 one_vec(1.0f);
+    FP32Vec8 half_vec(0.5f);
+
+    return x_vec * half_vec * (one_vec + erf_x);
+  }
+
+  // Elementwise reciprocal: 1/x (scalar per lane, for correctness)
+  FP32Vec8 rcp() const {
+    AliasReg in, out;
+    in.reg = reg;
+
+    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+      out.values[i] = 1.0f / in.values[i];
+    }
+    return FP32Vec8(out.reg);
+  }
+
+  // Elementwise rsqrt(x) = 1 / sqrt(x) (scalar per lane, for correctness)
+  FP32Vec8 rsqrt() const {
+    AliasReg in, out;
+    in.reg = reg;
+
+    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+      out.values[i] = 1.0f / std::sqrt(in.values[i]);
+    }
+    return FP32Vec8(out.reg);
  }

  FP32Vec8 operator*(const FP32Vec8& b) const {
@@ -316,10 +524,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  }

  explicit FP32Vec16(const BF16Vec16& v) {
-    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
-    reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
-    reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
-    reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
+    // On big-endian s390x, place BF16 first to get correct byte order
+    reg.val[0] = (__vector float)vec_mergeh(v.reg.val[0], zero);
+    reg.val[1] = (__vector float)vec_mergel(v.reg.val[0], zero);
+    reg.val[2] = (__vector float)vec_mergeh(v.reg.val[1], zero);
+    reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero);
  }

  explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
@@ -376,6 +585,23 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    return result;
  }

+  FP32Vec16 max(const FP32Vec16& b) const {
+    return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
+                                vec_max(reg.val[1], b.reg.val[1]),
+                                vec_max(reg.val[2], b.reg.val[2]),
+                                vec_max(reg.val[3], b.reg.val[3])}));
+  }
+
+  float reduce_max() const {
+    AliasReg ar;
+    ar.reg = reg;
+    float result = ar.values[0];
+    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) {
+      if (ar.values[i] > result) result = ar.values[i];
+    });
+    return result;
+  }
+
  void save(float* ptr) const {
    vec_xst(reg.val[0], 0, ptr);
    vec_xst(reg.val[1], 16, ptr);
@@ -402,15 +628,14 @@ struct VecType<c10::BFloat16> {
  using vec_type = BF16Vec8;
 };

+// On s390x, FP16 (Half) is not natively supported, use FP32 vectors instead
+using FP16Vec16 = FP32Vec16;
+
 template <typename T>
 void storeFP32(float v, T* ptr) {
  *ptr = v;
 }

-inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
-  acc = acc + a * b;
-}
-
 namespace c10 {
 struct BFloat16 {
  uint16_t value;  // Assume BFloat16 is defined as a struct containing a 16-bit
@@ -429,6 +654,79 @@ inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
  #define __VEC_CLASS_FP_NAN (1 << 6)
 #endif

+// Optimized FMA (Fused Multiply-Add) implementations using IBM Z vector
+// intrinsics
+
+// FP32Vec4 FMA: acc = acc + (a * b) or equivalently acc = fma(a, b, acc)
+FORCE_INLINE void fma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+  acc.reg = vec_madd(a.reg, b.reg, acc.reg);
+}
+
+// FP32Vec8 FMA: acc = acc + (a * b)
+FORCE_INLINE void fma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+  acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+  acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+// FP32Vec16 FMA: acc = acc + (a * b)
+FORCE_INLINE void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+  acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+  acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+  acc.reg.val[2] = vec_madd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+  acc.reg.val[3] = vec_madd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Multiply-Subtract: acc = acc - (a * b)
+FORCE_INLINE void fms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+  acc.reg = vec_msub(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void fms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+  acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+  acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void fms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+  acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+  acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+  acc.reg.val[2] = vec_msub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+  acc.reg.val[3] = vec_msub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Negative Multiply-Add: acc = -(a * b) + acc
+FORCE_INLINE void nfma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+  acc.reg = vec_nmadd(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void nfma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+  acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+  acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void nfma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+  acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+  acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+  acc.reg.val[2] = vec_nmadd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+  acc.reg.val[3] = vec_nmadd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Negative Multiply-Subtract: acc = -(a * b) - acc
+FORCE_INLINE void nfms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+  acc.reg = vec_nmsub(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void nfms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+  acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+  acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void nfms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+  acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+  acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+  acc.reg.val[2] = vec_nmsub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+  acc.reg.val[3] = vec_nmsub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
 const static __vector unsigned char omask = {2,  3,  6,  7,  10, 11, 14, 15,
                                             18, 19, 22, 23, 26, 27, 30, 31};
 const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
@@ -441,13 +739,24 @@ const static __vector unsigned int one = {1, 1, 1, 1};
 inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
  __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
+  __vector unsigned int lsb0 = inp0 >> sh16;
+  __vector unsigned int lsb1 = inp1 >> sh16;
+  lsb0 = lsb0 & one;
+  lsb1 = lsb1 & one;
+  __vector unsigned int rnd0 = lsb0 + bias;
+  __vector unsigned int rnd1 = lsb1 + bias;
+  inp0 = inp0 + rnd0;
+  inp1 = inp1 + rnd1;
  int cc;
  __vector __bool int sel0 =
      vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
  __vector __bool int sel1 =
      vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
-  inp0 = vec_sel(inp0, nan, sel0) >> sh16;
-  inp1 = vec_sel(inp1, nan, sel1) >> sh16;
+  inp0 = vec_sel(inp0, nan, sel0);
+  inp1 = vec_sel(inp1, nan, sel1);
+  inp0 = inp0 >> sh16;
+  inp1 = inp1 >> sh16;
+
  reg = (__vector signed short)vec_perm(inp0, inp1, omask);
 }

@@ -456,6 +765,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
  __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
  __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
+  __vector unsigned int lsb0 = inp0 >> sh16;
+  __vector unsigned int lsb1 = inp1 >> sh16;
+  __vector unsigned int lsb2 = inp2 >> sh16;
+  __vector unsigned int lsb3 = inp3 >> sh16;
+  lsb0 = lsb0 & one;
+  lsb1 = lsb1 & one;
+  lsb2 = lsb2 & one;
+  lsb3 = lsb3 & one;
+  __vector unsigned int rnd0 = lsb0 + bias;
+  __vector unsigned int rnd1 = lsb1 + bias;
+  __vector unsigned int rnd2 = lsb2 + bias;
+  __vector unsigned int rnd3 = lsb3 + bias;
+  inp0 = inp0 + rnd0;
+  inp1 = inp1 + rnd1;
+  inp2 = inp2 + rnd2;
+  inp3 = inp3 + rnd3;
  int cc;
  __vector __bool int sel0 =
      vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
@@ -465,15 +790,164 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
      vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
  __vector __bool int sel3 =
      vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
-  inp0 = vec_sel(inp0, nan, sel0) >> sh16;
-  inp1 = vec_sel(inp1, nan, sel1) >> sh16;
-  inp2 = vec_sel(inp2, nan, sel2) >> sh16;
-  inp3 = vec_sel(inp3, nan, sel3) >> sh16;
+  inp0 = vec_sel(inp0, nan, sel0);
+  inp1 = vec_sel(inp1, nan, sel1);
+  inp2 = vec_sel(inp2, nan, sel2);
+  inp3 = vec_sel(inp3, nan, sel3);
+  inp0 = inp0 >> sh16;
+  inp1 = inp1 >> sh16;
+  inp2 = inp2 >> sh16;
+  inp3 = inp3 >> sh16;
+
  reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
  reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
 }

-inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
+// 1D softmax over `n` elements in `input`, writes result to `output`.
+// Uses FP32Vec8 for main body, scalar tail handling.
+// Requirement: n > 0
+FORCE_INLINE void softmax_fp32vec8(float* output, const float* input, int n) {
+  if (n <= 0) return;
+
+  // ---------- Pass 1: find max ----------
+  float max_val = -std::numeric_limits<float>::infinity();
+  int i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    FP32Vec8 v(input + i);
+    FP32Vec8::AliasReg ar;
+    ar.reg = v.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      if (ar.values[j] > max_val) max_val = ar.values[j];
+    }
+  }
+  for (; i < n; ++i) {
+    if (input[i] > max_val) max_val = input[i];
+  }
+
+  // ---------- Pass 2: compute exp(x - max) and sum ----------
+  float sum = 0.0f;
+  i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    float tmp[FP32Vec8::VEC_ELEM_NUM];
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      tmp[j] = input[i + j] - max_val;
+    }
+
+    FP32Vec8 v(tmp);
+    FP32Vec8 e = v.exp();
+
+    FP32Vec8::AliasReg ar;
+    ar.reg = e.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      output[i + j] = ar.values[j];
+      sum += ar.values[j];
+    }
+  }
+
+  // Tail
+  for (; i < n; ++i) {
+    float x = input[i] - max_val;
+    float ex = std::exp(x);  // scalar tail
+    output[i] = ex;
+    sum += ex;
+  }
+
+  // ---------- Pass 3: normalize ----------
+  float inv_sum = 1.0f / sum;
+  i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    float tmp[FP32Vec8::VEC_ELEM_NUM];
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      tmp[j] = output[i + j] * inv_sum;
+    }
+    FP32Vec8 v(tmp);
+    v.save(output + i);
+  }
+
+  for (; i < n; ++i) {
+    output[i] *= inv_sum;
+  }
+}
+
+// 1D RMSNorm kernel:
+//   input:  x[0..n-1]
+//   weight: w[0..n-1] (gamma), may be nullptr
+//   output: y[i] = x[i] * inv_rms * (weight[i] if weight != nullptr else 1)
+//   eps: small epsilon for numerical stability
+FORCE_INLINE void rmsnorm_fp32vec8(float* output, const float* input,
+                                   const float* weight, int n, float eps) {
+  if (n <= 0) return;
+
+  // ---------- Pass 1: compute sum of squares ----------
+  float sum_sq = 0.0f;
+  int i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    FP32Vec8 x_vec(input + i);
+
+    FP32Vec8 sq = x_vec * x_vec;
+
+    FP32Vec8::AliasReg ar;
+    ar.reg = sq.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      sum_sq += ar.values[j];
+    }
+  }
+
+  // Tail
+  for (; i < n; ++i) {
+    float v = input[i];
+    sum_sq += v * v;
+  }
+
+  float mean_sq = sum_sq / static_cast<float>(n);
+  float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
+
+  // ---------- Pass 2: scale (and apply weight if given) ----------
+  const float inv_rms_f = inv_rms;
+  i = 0;
+
+  if (weight) {
+    // with gamma
+    for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+      FP32Vec8 x_vec(input + i);
+
+      float wtmp[FP32Vec8::VEC_ELEM_NUM];
+      for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+        wtmp[j] = weight[i + j];
+      }
+      FP32Vec8 w_vec(wtmp);
+
+      FP32Vec8 scale_vec(inv_rms_f);
+      FP32Vec8 y = x_vec * scale_vec * w_vec;
+      y.save(output + i);
+    }
+
+    for (; i < n; ++i) {
+      output[i] = input[i] * inv_rms_f * weight[i];
+    }
+  } else {
+    // without gamma
+    for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+      FP32Vec8 x_vec(input + i);
+      FP32Vec8 scale_vec(inv_rms_f);
+      FP32Vec8 y = x_vec * scale_vec;
+      y.save(output + i);
+    }
+
+    for (; i < n; ++i) {
+      output[i] = input[i] * inv_rms_f;
+    }
+  }
+}
+
+// Prefetch data to cache for better memory access performance
+FORCE_INLINE void prefetch(const void* addr) {
+  __builtin_prefetch(addr, 0, 3);  // 0=read, 3=high temporal locality
+}

 };  // namespace vec_op

--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -40,6 +40,23 @@ namespace vec_op {

 #define FORCE_INLINE __attribute__((always_inline)) inline

+// Function to get the timestamp using RDTSCP
+FORCE_INLINE uint64_t bench_timestamp() {
+  unsigned int cycles_low, cycles_high;
+  asm volatile(
+      ".intel_syntax noprefix\n\t"
+      "CPUID\n\t"        // Serialize instruction stream to ensure previous
+                         // instructions complete
+      "RDTSCP\n\t"       // Read TSC and core ID
+      "mov %0, edx\n\t"  // Store high 32 bits of TSC
+      "mov %1, eax\n\t"  // Store low 32 bits of TSC
+      ".att_syntax"
+      : "=r"(cycles_high), "=r"(cycles_low)::"rax", "rbx", "rcx",
+        "rdx"  // Clobbered registers
+  );
+  return (uint64_t)cycles_high << 32 | cycles_low;
+}
+
 namespace {
 template <typename T, T... indexes, typename F>
 constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
@@ -87,6 +104,8 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
  explicit FP16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

+  explicit FP16Vec16(const c10::Half v) : reg(_mm256_set1_epi16(v.x)) {}
+
  explicit FP16Vec16(const FP32Vec16&);

  void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
@@ -124,6 +143,8 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  explicit BF16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

+  explicit BF16Vec16(const c10::BFloat16 v) : reg(_mm256_set1_epi16(v.x)) {}
+
  explicit BF16Vec16(const FP32Vec16&);

  void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
@@ -333,6 +354,22 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  explicit FP32Vec16(__m512 data) : reg(data) {}

+  // de-pack 4 bit values
+  explicit FP32Vec16(int64_t value, const FP32Vec16& lut) {
+    int64_t mask_0 = 0x0F0F0F0F0F0F0F0F;
+    int64_t mask_1 = 0xF0F0F0F0F0F0F0F0;
+    int64_t value_0 = value & mask_0;
+    int64_t value_1 = value & mask_1;
+    __m128i vec_0 = _mm_movpi64_epi64((__m64)value_0);
+    __m128i vec_1 = _mm_movpi64_epi64((__m64)value_1);
+    vec_0 = _mm_cvtepu8_epi16(vec_0);
+    vec_1 = _mm_cvtepu8_epi16(vec_1);
+    vec_1 = _mm_slli_epi16(vec_1, 4);
+    __m128i vec = _mm_or_si128(vec_0, vec_1);
+    __m512i vec_i32 = _mm512_cvtepu8_epi32(vec);
+    reg = _mm512_permutexvar_ps(vec_i32, lut.reg);
+  }
+
  explicit FP32Vec16(const FP32Vec4& data)
      : reg((__m512)_mm512_inserti32x4(
            _mm512_inserti32x4(
@@ -407,13 +444,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  float reduce_min() const { return _mm512_reduce_min_ps(reg); }

-  template <int group_size>
-  float reduce_sub_sum(int idx) {
-    static_assert(VEC_ELEM_NUM % group_size == 0);
-    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
-    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
-    return _mm512_mask_reduce_add_ps(mask, reg);
-  }
+  float get_last_elem() const { return _mm512_cvtss_f32(reg); }

  void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); }

@@ -446,9 +477,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}

-  explicit FP32Vec16(const FP32Vec16& data)
-      : reg_low(data.reg_low), reg_high(data.reg_high) {}
-
  explicit FP32Vec16(const FP32Vec4& data)
      : reg_low((__m256)_mm256_inserti128_si256(
            _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)),
@@ -504,6 +532,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
                     _mm256_div_ps(reg_high, b.reg_high));
  }

+  FP32Vec16 max(const FP32Vec16& b) const {
+    return FP32Vec16(_mm256_max_ps(reg_low, b.reg_low),
+                     _mm256_max_ps(reg_high, b.reg_high));
+  }
+
+  float reduce_max() const {
+    __m256 v = _mm256_max_ps(reg_low, reg_high);
+    // Permute to compare elements within 128-bit lanes
+    __m256 v_shuffled = _mm256_permute_ps(
+        v, 0b00001011);  // Swap halves within each 128-bit lane
+    __m256 v_max = _mm256_max_ps(v, v_shuffled);
+
+    v_shuffled = _mm256_permute_ps(
+        v_max, 0b00000001);  // Shuffle elements within each 128-bit lane
+    v_max = _mm256_max_ps(v_max, v_shuffled);
+
+    // Permute to compare elements between 128-bit lanes
+    v_shuffled =
+        _mm256_permute2f128_ps(v_max, v_max, 0b00000001);  // Swap 128-bit lanes
+    v_max = _mm256_max_ps(v_max, v_shuffled);
+
+    // At this point, the maximum value is present in all elements of v_max.
+    // Extract the first element for the scalar result.
+    return _mm256_cvtss_f32(v_max);  // Extract the lowest 32-bit float
+  }
+
  float reduce_sum() const {
    FP32Vec8 low = FP32Vec8(reg_low);
    FP32Vec8 high = FP32Vec8(reg_high);
@@ -642,7 +696,7 @@ inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
 inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
    : reg(_mm256_insertf128_si256(
          _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg),
-          FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {}
+          FP16Vec8(FP32Vec8(v.reg_high)).reg, 1)) {}
 #endif

 #ifdef __AVX512BF16__
@@ -713,6 +767,25 @@ inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
 inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
  _mm512_stream_ps((float*)ptr, vec.reg);
 }
+
+static void interleave_save(const BF16Vec16& vec0, const BF16Vec16& vec1,
+                            void* ptr) {
+  __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg);
+  __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg);
+  vec_1 = _mm512_slli_epi32(vec_1, 16);
+  vec_0 = _mm512_or_si512(vec_0, vec_1);
+  _mm512_storeu_epi32(ptr, vec_0);
+}
+
+static void interleave_save(const FP16Vec16& vec0, const FP16Vec16& vec1,
+                            void* ptr) {
+  __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg);
+  __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg);
+  vec_1 = _mm512_slli_epi32(vec_1, 16);
+  vec_0 = _mm512_or_si512(vec_0, vec_1);
+  _mm512_storeu_epi32(ptr, vec_0);
+}
+
 #endif

 inline void mem_barrier() { _mm_mfence(); }
--- a/csrc/cpu/cpu_wna16.cpp
+++ b/csrc/cpu/cpu_wna16.cpp
@@ -0,0 +1,402 @@
+#include "cpu_types.hpp"
+#include "scratchpad_manager.h"
+#include "utils.hpp"
+
+#ifdef CPU_CAPABILITY_AMXBF16
+  #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
+#endif
+#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp"
+
+#define VLLM_DISPATCH_CASE_16B_TYPES(...)                 \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
+
+#define VLLM_DISPATCH_16B_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_16B_TYPES(__VA_ARGS__))
+
+template <typename T>
+void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
+                  int32_t stride) {
+  std::stringstream ss;
+  ss << std::fixed << std::setprecision(5) << name << ": [\n";
+  auto* curr_logits_buffer = ptr;
+  for (int32_t m = 0; m < row; ++m) {
+    for (int32_t n = 0; n < col; ++n) {
+      ss << curr_logits_buffer[n] << ", ";
+    }
+    ss << "\n";
+    curr_logits_buffer += stride;
+  }
+  ss << "]\n";
+  std::printf("%s", ss.str().c_str());
+}
+
+namespace {
+using cpu_utils::ISA;
+using cpu_utils::VecTypeTrait;
+
+template <typename scalar_t, ISA isa, bool has_zp, bool use_desc_act>
+class Dequantizer4b {
+ public:
+  constexpr static int32_t pack_num = 32 / 4;
+  using scalar_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
+
+ public:
+  static void dequant(int32_t* __restrict__ q_weight,
+                      scalar_t* __restrict__ weight,
+                      scalar_t* __restrict__ scales,
+                      int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx,
+                      const int64_t scales_stride, const int64_t zeros_stride,
+                      const int32_t k_size, const int32_t group_size) {
+    vec_op::FP32Vec16 lut;
+    if constexpr (has_zp) {
+      // AWQ
+      alignas(64) static const float LUT[16] = {
+          0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+          8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+      lut = vec_op::FP32Vec16(LUT);
+    } else {
+      // GPTQ
+      alignas(64) static const float LUT[16] = {
+          -8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f,
+          0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f};
+      lut = vec_op::FP32Vec16(LUT);
+    }
+
+    // per 64-bits elem contains 16 output channels
+    int64_t* __restrict__ curr_q_weight = reinterpret_cast<int64_t*>(q_weight);
+    int64_t* __restrict__ curr_zeros = reinterpret_cast<int64_t*>(zeros);
+    scalar_t* __restrict__ curr_weight = weight;
+    scalar_t* __restrict__ curr_scale = scales;
+    vec_op::FP32Vec16 scale_0;
+    vec_op::FP32Vec16 scale_1;
+    vec_op::FP32Vec16 zero_0;
+    vec_op::FP32Vec16 zero_1;
+    int32_t group_counter = 0;
+    for (int32_t k_idx = 0; k_idx < k_size; k_idx += 2) {
+      int64_t qwb_0 = *curr_q_weight;
+      int64_t qwb_1 = *(curr_q_weight + 1);
+      vec_op::FP32Vec16 wb_0(qwb_0, lut);
+      vec_op::FP32Vec16 wb_1(qwb_1, lut);
+
+      if constexpr (!use_desc_act) {
+        if (group_counter == 0) {
+          scale_0 = vec_op::FP32Vec16(scalar_vec_t(curr_scale));
+          scale_1 = vec_op::FP32Vec16(scale_0);
+          curr_scale += scales_stride;
+
+          if constexpr (has_zp) {
+            zero_0 = vec_op::FP32Vec16(*curr_zeros, lut);
+            zero_1 = vec_op::FP32Vec16(zero_0);
+            curr_zeros += zeros_stride / 2;
+          }
+        }
+      } else {
+        int32_t g_idx_0 = g_idx[k_idx];
+        int32_t g_idx_1 = g_idx[k_idx + 1];
+        scale_0 = vec_op::FP32Vec16(
+            scalar_vec_t(curr_scale + g_idx_0 * scales_stride));
+        scale_1 = vec_op::FP32Vec16(
+            scalar_vec_t(curr_scale + g_idx_1 * scales_stride));
+        if constexpr (has_zp) {
+          zero_0 = vec_op::FP32Vec16(*(curr_zeros + g_idx_0 * zeros_stride / 2),
+                                     lut);
+          zero_1 = vec_op::FP32Vec16(*(curr_zeros + g_idx_1 * zeros_stride / 2),
+                                     lut);
+        }
+      }
+
+      if constexpr (has_zp) {
+        wb_0 = wb_0 - zero_0;
+        wb_1 = wb_1 - zero_1;
+      }
+
+      wb_0 = wb_0 * scale_0;
+      wb_1 = wb_1 * scale_1;
+
+      scalar_vec_t output_vec_0(wb_0);
+      scalar_vec_t output_vec_1(wb_1);
+
+      // AMX needs to interlave K elements to pack as 32 bits
+      if constexpr (isa == ISA::AMX) {
+        vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight);
+      } else {
+        output_vec_0.save(curr_weight);
+        output_vec_1.save(curr_weight + 16);
+      }
+
+      // update
+      curr_q_weight += 2;
+      curr_weight += 32;
+      if constexpr (!use_desc_act) {
+        group_counter += 2;
+        if (group_counter == group_size) {
+          group_counter = 0;
+        }
+      }
+    }
+  }
+};
+};  // namespace
+
+template <typename scalar_t, typename dequantizer_t, typename gemm_t>
+void cpu_gemm_wna16_impl(
+    scalar_t* __restrict__ input, int32_t* __restrict__ q_weight,
+    scalar_t* __restrict__ output, scalar_t* __restrict__ scales,
+    int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx,
+    scalar_t* __restrict__ bias, const int32_t m_size, const int32_t n_size,
+    const int32_t k_size, const int64_t input_stride,
+    const int64_t output_stride, const int64_t scales_group_stride,
+    const int64_t zeros_group_stride, const int32_t group_num,
+    const int32_t group_size, const int64_t pack_factor) {
+  constexpr int32_t gemm_n_tile_size = gemm_t::NSize;
+  constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize;
+  constexpr int32_t n_block_size = 16;
+  static_assert(gemm_n_tile_size % n_block_size == 0);
+  const int32_t thread_num = omp_get_max_threads();
+
+  // a simple schedule policy, just to hold more B tiles in L2 and make sure
+  // each thread has tasks
+  const int32_t n_partition_size = [&]() {
+    const int64_t cache_size = cpu_utils::get_l2_size();
+    int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t));
+    int64_t ps_thread_limit = n_size / thread_num;
+    ps_cache_limit =
+        std::max((ps_cache_limit / gemm_n_tile_size) * gemm_n_tile_size,
+                 (int64_t)gemm_n_tile_size);
+    ps_thread_limit =
+        std::max((ps_thread_limit / gemm_n_tile_size) * gemm_n_tile_size,
+                 (int64_t)gemm_n_tile_size);
+    return std::min(ps_cache_limit, ps_thread_limit);
+  }();
+  const int32_t task_num = (n_size + n_partition_size - 1) / n_partition_size;
+
+  // get buffer size
+  const int64_t b_buffer_size =
+      (((n_partition_size * k_size * sizeof(scalar_t) + 63) / 64) * 64);
+  const int64_t c_buffer_size =
+      (((gemm_m_tile_size * gemm_n_tile_size * sizeof(float) + 63) / 64) * 64);
+  const int64_t b_buffer_offset = 0;
+  const int64_t c_buffer_offset = b_buffer_size;
+  const int64_t buffer_size = b_buffer_size + c_buffer_size;
+  DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size *
+                                                                thread_num);
+
+  alignas(64) cpu_utils::Counter counter;
+  cpu_utils::Counter* counter_ptr = &counter;
+
+#pragma omp parallel for schedule(static, 1)
+  for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
+    scalar_t* __restrict__ b_buffer = nullptr;
+    float* __restrict__ c_buffer = nullptr;
+    {
+      uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager()
+                                ->get_data<uint8_t>() +
+                            thread_id * buffer_size;
+      b_buffer = reinterpret_cast<scalar_t*>(buffer_ptr + b_buffer_offset);
+      c_buffer = reinterpret_cast<float*>(buffer_ptr + c_buffer_offset);
+    }
+
+    const int64_t q_weight_block_stride = n_block_size / pack_factor * k_size;
+    const int64_t b_buffer_block_stride = n_block_size * k_size;
+    const int32_t zeros_block_stride = n_block_size / pack_factor;
+
+    gemm_t gemm;
+
+    for (;;) {
+      int32_t task_id = counter_ptr->acquire_counter();
+
+      if (task_id >= task_num) {
+        break;
+      }
+
+      const int32_t n_start_idx = task_id * n_partition_size;
+      const int32_t n_block_start_idx = n_start_idx / n_block_size;
+      const int32_t n_num = std::min(n_partition_size, n_size - n_start_idx);
+      const int32_t n_block_num = n_num / n_block_size;
+      // std::printf("thread_id: %d, task_id: %d, n_start_idx: %d, n_num: %d\n",
+      // thread_id, task_id, n_start_idx, n_num);
+
+      // dequant weight
+      {
+        int32_t* __restrict__ curr_q_weight =
+            q_weight + n_block_start_idx * q_weight_block_stride;
+        scalar_t* __restrict__ curr_b_buffer = b_buffer;
+        scalar_t* __restrict__ curr_scales = scales + n_start_idx;
+        int32_t* __restrict__ curr_zeros = zeros + n_start_idx / pack_factor;
+        for (int32_t block_idx = 0; block_idx < n_block_num; ++block_idx) {
+          dequantizer_t::dequant(curr_q_weight, curr_b_buffer, curr_scales,
+                                 curr_zeros, g_idx, scales_group_stride,
+                                 zeros_group_stride, k_size, group_size);
+
+          // if (block_idx == 0 && n_start_idx == 0) {
+          //     print_logits("depacked weight", curr_b_buffer, k_size,
+          //     n_block_size, n_block_size);
+          // }
+
+          // update
+          curr_q_weight += q_weight_block_stride;
+          curr_b_buffer += b_buffer_block_stride;
+          curr_scales += n_block_size;
+          curr_zeros += zeros_block_stride;
+        }
+      }
+
+      // compute loop
+      {
+        const int32_t n_tile_num = n_num / gemm_n_tile_size;
+        scalar_t* __restrict__ curr_input = input;
+        scalar_t* __restrict__ init_bias = bias;
+        if (bias != nullptr) {
+          init_bias += n_start_idx;
+        }
+        scalar_t* __restrict__ init_output = output + n_start_idx;
+        for (int32_t m_idx = 0; m_idx < m_size; m_idx += gemm_m_tile_size) {
+          const int32_t curr_m_size =
+              std::min(gemm_m_tile_size, m_size - m_idx);
+          scalar_t* __restrict__ curr_b_buffer = b_buffer;
+          scalar_t* __restrict__ curr_bias = init_bias;
+          scalar_t* __restrict__ curr_output = init_output;
+          for (int32_t n_tile_idx = 0; n_tile_idx < n_tile_num; ++n_tile_idx) {
+            gemm.gemm(curr_input, curr_b_buffer, c_buffer, curr_m_size, k_size,
+                      input_stride, b_buffer_block_stride, gemm_n_tile_size,
+                      false);
+
+            if (bias != nullptr) {
+              cpu_micro_gemm::bias_epilogue<gemm_n_tile_size>(
+                  c_buffer, curr_output, curr_bias, curr_m_size,
+                  gemm_n_tile_size, output_stride);
+              curr_bias += gemm_n_tile_size;
+            } else {
+              cpu_micro_gemm::default_epilogue<gemm_n_tile_size>(
+                  c_buffer, curr_output, curr_m_size, gemm_n_tile_size,
+                  output_stride);
+            }
+
+            curr_b_buffer +=
+                b_buffer_block_stride * (gemm_n_tile_size / n_block_size);
+            curr_output += gemm_n_tile_size;
+          }
+          curr_input += gemm_m_tile_size * input_stride;
+          init_output += gemm_m_tile_size * output_stride;
+        }
+      }
+    }
+  }
+}
+
+void cpu_gemm_wna16(
+    const torch::Tensor& input,  // [M, K]
+    const torch::Tensor&
+        q_weight,           // [N / 16, K * 16 / pack_factor], packed as int32
+    torch::Tensor& output,  // [M, N]
+    const torch::Tensor& scales,  // [group_num, N]
+    const std::optional<torch::Tensor>&
+        zeros,  // [group_num, N / pack_factor], packed as int32
+    const std::optional<torch::Tensor>& g_idx,  // [K]
+    const std::optional<torch::Tensor>& bias,   // [N]
+    const int64_t pack_factor, const std::string& isa_hint) {
+  using cpu_utils::ISA;
+  TORCH_CHECK_EQ(pack_factor, 8);  // only supports 4bits
+  const int32_t a_m_size = input.size(0);
+  const int32_t a_k_size = input.size(1);
+  const int64_t a_m_stride = input.stride(0);
+  const int32_t b_n_size = q_weight.size(0) * 16;
+  TORCH_CHECK_EQ(a_k_size % 32, 0);
+  TORCH_CHECK_EQ(b_n_size % 32, 0);
+  const int32_t group_num = scales.size(0);
+  const int32_t group_size = a_k_size / group_num;
+  TORCH_CHECK_EQ(group_size % 2, 0);
+  const int64_t scales_group_stride = scales.stride(0);
+  const int64_t output_m_stride = output.stride(0);
+
+  bool has_zp = zeros.has_value();
+  bool use_desc_act = g_idx.has_value();
+  TORCH_CHECK(!(has_zp && use_desc_act));
+
+  ISA isa = [&]() {
+    if (isa_hint == "amx") {
+      return ISA::AMX;
+    } else if (isa_hint == "vec") {
+      return ISA::VEC;
+    } else {
+      TORCH_CHECK(false, "unsupported isa hint: " + isa_hint);
+    }
+  }();
+
+  int32_t* zeros_ptr = has_zp ? zeros->data_ptr<int32_t>() : nullptr;
+  const int64_t zeros_group_stride = has_zp ? zeros->stride(0) : 0;
+  int32_t* g_idx_ptr = use_desc_act ? g_idx->data_ptr<int32_t>() : nullptr;
+
+  VLLM_DISPATCH_16B_TYPES(input.scalar_type(), "cpu_gemm_wna16", [&]() {
+    if (isa == ISA::AMX) {
+      using gemm_t = cpu_micro_gemm::MicroGemm<ISA::AMX, scalar_t>;
+      if (has_zp) {
+        using dequantizer_t = Dequantizer4b<scalar_t, ISA::AMX, true, false>;
+        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
+            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
+            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
+            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
+            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+            scales_group_stride, zeros_group_stride, group_num, group_size,
+            pack_factor);
+        return;
+      }
+      if (use_desc_act) {
+        using dequantizer_t = Dequantizer4b<scalar_t, ISA::AMX, false, true>;
+        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
+            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
+            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
+            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
+            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+            scales_group_stride, zeros_group_stride, group_num, group_size,
+            pack_factor);
+        return;
+      } else {
+        using dequantizer_t = Dequantizer4b<scalar_t, ISA::AMX, false, false>;
+        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
+            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
+            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
+            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
+            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+            scales_group_stride, zeros_group_stride, group_num, group_size,
+            pack_factor);
+        return;
+      }
+    } else if (isa == ISA::VEC) {
+      using gemm_t = cpu_micro_gemm::MicroGemm<ISA::VEC, scalar_t>;
+      if (has_zp) {
+        using dequantizer_t = Dequantizer4b<scalar_t, ISA::VEC, true, false>;
+        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
+            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
+            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
+            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
+            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+            scales_group_stride, zeros_group_stride, group_num, group_size,
+            pack_factor);
+        return;
+      }
+      if (use_desc_act) {
+        using dequantizer_t = Dequantizer4b<scalar_t, ISA::VEC, false, true>;
+        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
+            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
+            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
+            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
+            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+            scales_group_stride, zeros_group_stride, group_num, group_size,
+            pack_factor);
+        return;
+      } else {
+        using dequantizer_t = Dequantizer4b<scalar_t, ISA::VEC, false, false>;
+        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
+            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
+            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
+            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
+            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+            scales_group_stride, zeros_group_stride, group_num, group_size,
+            pack_factor);
+        return;
+      }
+    }
+  });
+}
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -5,6 +5,7 @@
 #include "common/memory.hpp"

 #include "dnnl_helper.h"
+#include "scratchpad_manager.h"

 static dnnl::engine& default_engine() {
  static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
@@ -22,23 +23,6 @@ void release_dnnl_matmul_handler(int64_t handler) {
  delete ptr;
 }

-DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
-  this->realloc(allocation_unit * 128);
-}
-
-void DNNLScratchPadManager::realloc(size_t new_size) {
-  new_size = round(new_size);
-  if (new_size > size_) {
-    ptr_ = std::aligned_alloc(64, new_size);
-    size_ = new_size;
-  }
-}
-
-DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
-  static DNNLScratchPadManager manager;
-  return &manager;
-}
-
 template <typename KT, typename VT>
 class DNNLPrimitiveCache {
 public:
@@ -412,9 +396,9 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
    : DNNLMatMulPrimitiveHandler(
          static_cast<DNNLMatMulPrimitiveHandler::Args>(args), args.ab_type),
      m_size_cache_(nullptr) {
-  assert(ab_type_ == dnnl::memory::data_type::f32 ||
-         ab_type_ == dnnl::memory::data_type::bf16 ||
-         ab_type_ == dnnl::memory::data_type::f16);
+  assert(b_type_ == dnnl::memory::data_type::f32 ||
+         b_type_ == dnnl::memory::data_type::bf16 ||
+         b_type_ == dnnl::memory::data_type::f16);

  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
                                   {b_k_stride_, b_n_stride_});
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@@ -59,30 +59,6 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() {
  return DNNLType<std::decay_t<T>>::type;
 }

-class DNNLScratchPadManager {
- public:
-  static constexpr size_t allocation_unit = 4 * 1024 * 1024;  // 4KB
-
-  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
-
-  DNNLScratchPadManager();
-
-  template <typename T>
-  T* get_data() {
-    return reinterpret_cast<T*>(ptr_);
-  }
-
-  static size_t round(size_t size) {
-    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
-  }
-
-  void realloc(size_t new_size);
-
- private:
-  size_t size_;
-  void* ptr_;
-};
-
 class DNNLMatMulPrimitiveHandler {
 public:
  virtual ~DNNLMatMulPrimitiveHandler() = default;
--- a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp
@@ -0,0 +1,245 @@
+#ifndef CPU_MICRO_GEMM_AMX_HPP
+#define CPU_MICRO_GEMM_AMX_HPP
+#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp"
+
+namespace cpu_micro_gemm {
+namespace {
+// AMX specific
+constexpr static int64_t AMX_TILE_ROW_BYTES = 64;
+constexpr static int64_t AMX_TILE_ROW_NUM = 16;
+constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM;
+
+typedef struct __tile_config {
+  uint8_t palette_id = 1;
+  uint8_t start_row = 0;
+  uint8_t reserved_0[14] = {0};
+  uint16_t colsb[16] = {0};
+  uint8_t rows[16] = {0};
+} __tilecfg;
+
+// 2-2-4 pattern, for 16 < m <= 32
+// TILE 0, 1: load A matrix, row num should be 16, m - 16
+// TILE 2, 3: load B matrix, row num should be 16
+// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m
+// - 16
+template <typename scalar_t>
+class TileGemm224 {
+ public:
+  FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    TORCH_CHECK(false, "Unsupported data type for TileGemm224");
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    TORCH_CHECK(false, "Unsupported data type for TileGemm224");
+  }
+};
+
+template <>
+class TileGemm224<c10::BFloat16> {
+ public:
+  using scalar_t = c10::BFloat16;
+  FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+    c10::BFloat16* __restrict__ a_tile_0 = a_ptr;
+    c10::BFloat16* __restrict__ a_tile_1 = a_ptr + lda * AMX_TILE_ROW_NUM;
+    const int64_t a_tile_stride = lda * sizeof(c10::BFloat16);
+
+    // B is always packed as 16 output channels block
+    c10::BFloat16* __restrict__ b_tile_2 = b_ptr;
+    c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride;
+    const int32_t b_tile_stride = AMX_TILE_ROW_BYTES;
+
+    float* __restrict__ c_tile_4 = c_ptr;
+    float* __restrict__ c_tile_5 =
+        c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float);
+    float* __restrict__ c_tile_6 = c_ptr + AMX_TILE_ROW_NUM * ldc;
+    float* __restrict__ c_tile_7 =
+        c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float);
+    const int32_t c_tile_stride = ldc * sizeof(float);
+
+    if (accum_c) {
+      _tile_loadd(4, c_tile_4, c_tile_stride);
+      _tile_loadd(5, c_tile_5, c_tile_stride);
+      _tile_loadd(6, c_tile_6, c_tile_stride);
+      _tile_loadd(7, c_tile_7, c_tile_stride);
+    } else {
+      _tile_zero(4);
+      _tile_zero(5);
+      _tile_zero(6);
+      _tile_zero(7);
+    }
+
+    for (int32_t k = 0; k < k_times; ++k) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_tile_stride);
+      _tile_dpbf16ps(4, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_tile_stride);
+      _tile_dpbf16ps(5, 0, 3);
+      _tile_loadd(1, a_tile_1, a_tile_stride);
+      _tile_dpbf16ps(6, 1, 2);
+      _tile_dpbf16ps(7, 1, 3);
+
+      // update ptrs
+      a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    }
+
+    _tile_stored(4, c_tile_4, c_tile_stride);
+    _tile_stored(5, c_tile_5, c_tile_stride);
+    _tile_stored(6, c_tile_6, c_tile_stride);
+    _tile_stored(7, c_tile_7, c_tile_stride);
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    const int32_t m_0 = AMX_TILE_ROW_NUM;
+    const int32_t m_1 = m - AMX_TILE_ROW_NUM;
+    config.rows[0] = m_0;
+    config.rows[1] = m_1;
+    config.rows[2] = AMX_TILE_ROW_NUM;
+    config.rows[3] = AMX_TILE_ROW_NUM;
+    config.rows[4] = m_0;
+    config.rows[5] = m_0;
+    config.rows[6] = m_1;
+    config.rows[7] = m_1;
+    _tile_loadconfig(&config);
+  }
+};
+
+// 1-2-2 pattern, for 0 < m <= 16
+// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be
+// m, m
+// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row
+// num should be 16
+// TILE 6, 7, (6, 7): store results C matrix, row num should be
+// m
+template <typename scalar_t>
+class TileGemm122 {
+ public:
+  FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    TORCH_CHECK(false, "Unsupported data type for TileGemm122");
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    TORCH_CHECK(false, "Unsupported data type for TileGemm122");
+  }
+};
+
+template <>
+class TileGemm122<c10::BFloat16> {
+ public:
+  using scalar_t = c10::BFloat16;
+  FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    c10::BFloat16* __restrict__ a_tile_0 = a_ptr;
+    c10::BFloat16* __restrict__ a_tile_1 =
+        a_ptr + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+    const int64_t a_tile_stride = lda * sizeof(c10::BFloat16);
+
+    c10::BFloat16* __restrict__ b_tile_2 = b_ptr;
+    c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride;
+    c10::BFloat16* __restrict__ b_tile_4 =
+        b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    c10::BFloat16* __restrict__ b_tile_5 =
+        b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    int64_t b_stride = AMX_TILE_ROW_BYTES;
+
+    float* __restrict__ c_tile_6 = c_ptr;
+    float* __restrict__ c_tile_7 = c_ptr + AMX_TILE_ROW_BYTES / sizeof(float);
+    int64_t c_stride = ldc * sizeof(float);
+
+    const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+    const int32_t k_group_times = k_times / 2;
+    const bool has_tail = (k_times % 2 == 1);
+
+    if (accum_c) {
+      _tile_loadd(6, c_tile_6, c_stride);
+      _tile_loadd(7, c_tile_7, c_stride);
+    } else {
+      _tile_zero(6);
+      _tile_zero(7);
+    }
+
+    for (int32_t k = 0; k < k_group_times; ++k) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_dpbf16ps(6, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_dpbf16ps(7, 0, 3);
+      _tile_loadd(1, a_tile_1, a_tile_stride);
+      _tile_stream_loadd(4, b_tile_4, b_stride);
+      _tile_dpbf16ps(6, 1, 4);
+      _tile_stream_loadd(5, b_tile_5, b_stride);
+      _tile_dpbf16ps(7, 1, 5);
+
+      // update ptrs
+      a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+      b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_3 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+      b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+    }
+
+    if (has_tail) {
+      _tile_loadd(0, a_tile_0, a_tile_stride);
+      _tile_stream_loadd(2, b_tile_2, b_stride);
+      _tile_dpbf16ps(6, 0, 2);
+      _tile_stream_loadd(3, b_tile_3, b_stride);
+      _tile_dpbf16ps(7, 0, 3);
+    }
+
+    _tile_stored(6, c_tile_6, c_stride);
+    _tile_stored(7, c_tile_7, c_stride);
+  }
+
+  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+    config.rows[0] = m;
+    config.rows[1] = m;
+    config.rows[2] = AMX_TILE_ROW_NUM;
+    config.rows[3] = AMX_TILE_ROW_NUM;
+    config.rows[4] = AMX_TILE_ROW_NUM;
+    config.rows[5] = AMX_TILE_ROW_NUM;
+    config.rows[6] = m;
+    config.rows[7] = m;
+    _tile_loadconfig(&config);
+  }
+};
+}  // namespace
+
+// Gemm kernel uses AMX, requires B matrix to be packed
+template <typename scalar_t>
+class MicroGemm<cpu_utils::ISA::AMX, scalar_t> {
+ public:
+  static constexpr int32_t MaxMSize = 32;
+  static constexpr int32_t NSize = 32;
+
+ public:
+  MicroGemm() : curr_m_(-1) {
+    vec_op::unroll_loop<int, 8>([&](int i) { amx_tile_config_.colsb[i] = 64; });
+  }
+
+  void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    if (m > AMX_TILE_ROW_NUM) {
+      if (m != curr_m_) {
+        curr_m_ = m;
+        TileGemm224<scalar_t>::init_tile_config(m, amx_tile_config_);
+      }
+      TileGemm224<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
+    } else {
+      if (m != curr_m_) {
+        curr_m_ = m;
+        TileGemm122<scalar_t>::init_tile_config(m, amx_tile_config_);
+      }
+      TileGemm122<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
+    }
+  }
+
+ private:
+  alignas(64) __tilecfg amx_tile_config_;
+  int32_t curr_m_;
+};
+
+}  // namespace cpu_micro_gemm
+
+#endif
--- a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp
@@ -0,0 +1,91 @@
+#ifndef CPU_MICRO_GEMM_IMPL_HPP
+#define CPU_MICRO_GEMM_IMPL_HPP
+#include "cpu/utils.hpp"
+#include "cpu/cpu_types.hpp"
+
+namespace cpu_micro_gemm {
+#define DEFINE_CPU_MICRO_GEMM_PARAMS                                        \
+  scalar_t *__restrict__ a_ptr, scalar_t *__restrict__ b_ptr,               \
+      float *__restrict__ c_ptr, const int32_t m, const int32_t k,          \
+      const int64_t lda, const int64_t b_n_group_stride, const int64_t ldc, \
+      const bool accum_c
+
+#define CPU_MICRO_GEMM_PARAMS \
+  a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c
+
+template <cpu_utils::ISA isa, typename scalar_t>
+class MicroGemm {
+ public:
+  static constexpr int32_t MaxMSize = 16;
+  static constexpr int32_t NSize = 16;
+
+ public:
+  void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    TORCH_CHECK(false, "Unimplemented MicroGemm.");
+  }
+};
+
+template <int32_t n_size, typename scalar_t>
+FORCE_INLINE void default_epilogue(float* __restrict__ c_ptr,
+                                   scalar_t* __restrict__ d_ptr,
+                                   const int32_t m, const int64_t ldc,
+                                   const int64_t ldd) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  static_assert(n_size % 16 == 0);
+
+  float* __restrict__ curr_c = c_ptr;
+  scalar_t* __restrict__ curr_d = d_ptr;
+  for (int32_t i = 0; i < m; ++i) {
+    float* __restrict__ curr_c_iter = curr_c;
+    scalar_t* __restrict__ curr_d_iter = curr_d;
+    vec_op::unroll_loop<int32_t, n_size / 16>([&](int32_t n_g_idx) {
+      vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
+      scalar_vec_t c_vec(c_vec_fp32);
+      c_vec.save(curr_d_iter);
+      curr_c_iter += 16;
+      curr_d_iter += 16;
+    });
+    curr_c += ldc;
+    curr_d += ldd;
+  }
+}
+
+template <int32_t n_size, typename scalar_t>
+FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr,
+                                scalar_t* __restrict__ d_ptr,
+                                scalar_t* __restrict__ bias_ptr,
+                                const int32_t m, const int64_t ldc,
+                                const int64_t ldd) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  static_assert(n_size % 16 == 0);
+  constexpr int32_t n_group_num = n_size / 16;
+  static_assert(n_group_num <= 16);
+
+  vec_op::FP32Vec16 bias_vecs[n_group_num];
+  scalar_t* __restrict__ curr_bias = bias_ptr;
+  vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t i) {
+    scalar_vec_t vec(curr_bias);
+    bias_vecs[i] = vec_op::FP32Vec16(vec);
+    curr_bias += 16;
+  });
+
+  float* __restrict__ curr_c = c_ptr;
+  scalar_t* __restrict__ curr_d = d_ptr;
+  for (int32_t i = 0; i < m; ++i) {
+    float* __restrict__ curr_c_iter = curr_c;
+    scalar_t* __restrict__ curr_d_iter = curr_d;
+    vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t n_g_idx) {
+      vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
+      c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx];
+      scalar_vec_t c_vec(c_vec_fp32);
+      c_vec.save(curr_d_iter);
+      curr_c_iter += 16;
+      curr_d_iter += 16;
+    });
+    curr_c += ldc;
+    curr_d += ldd;
+  }
+}
+}  // namespace cpu_micro_gemm
+
+#endif
--- a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp
@@ -0,0 +1,115 @@
+#ifndef CPU_MICRO_GEMM_VEC_HPP
+#define CPU_MICRO_GEMM_VEC_HPP
+#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp"
+
+namespace cpu_micro_gemm {
+namespace {
+// 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32]
+template <typename scalar_t>
+class TileGemm82 {
+ public:
+  FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    switch (m) {
+      case 1:
+        gemm_micro<1>(CPU_MICRO_GEMM_PARAMS);
+        break;
+      case 2:
+        gemm_micro<2>(CPU_MICRO_GEMM_PARAMS);
+        break;
+      case 3:
+        gemm_micro<3>(CPU_MICRO_GEMM_PARAMS);
+        break;
+      case 4:
+        gemm_micro<4>(CPU_MICRO_GEMM_PARAMS);
+        break;
+      case 5:
+        gemm_micro<5>(CPU_MICRO_GEMM_PARAMS);
+        break;
+      case 6:
+        gemm_micro<6>(CPU_MICRO_GEMM_PARAMS);
+        break;
+      case 7:
+        gemm_micro<7>(CPU_MICRO_GEMM_PARAMS);
+        break;
+      case 8:
+        gemm_micro<8>(CPU_MICRO_GEMM_PARAMS);
+        break;
+    }
+  }
+
+  template <int32_t M>
+  static void gemm_micro(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    static_assert(0 < M <= 8);
+    using load_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+
+    scalar_t* __restrict__ curr_b_0 = b_ptr;
+    scalar_t* __restrict__ curr_b_1 = b_ptr + b_n_group_stride;
+    float* __restrict__ curr_c_0 = c_ptr;
+    float* __restrict__ curr_c_1 = c_ptr + 16;
+
+    vec_op::FP32Vec16 c_regs[M * 2];
+    if (accum_c) {
+      float* __restrict__ curr_m_c_0 = curr_c_0;
+      float* __restrict__ curr_m_c_1 = curr_c_1;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0);
+        c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1);
+
+        // update
+        curr_m_c_0 += ldc;
+        curr_m_c_1 += ldc;
+      });
+    }
+
+    scalar_t* __restrict__ curr_a = a_ptr;
+    for (int32_t k_idx = 0; k_idx < k; ++k_idx) {
+      load_vec_t b_0_reg(curr_b_0);
+      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
+      load_vec_t b_1_reg(curr_b_1);
+      vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg);
+
+      scalar_t* __restrict__ curr_m_a = curr_a;
+      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+        scalar_t v = *curr_m_a;
+        load_vec_t a_reg_original(v);
+        vec_op::FP32Vec16 a_reg(a_reg_original);
+        c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg;
+        c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg;
+
+        // update
+        curr_m_a += lda;
+      });
+
+      // update
+      curr_a += 1;
+      curr_b_0 += 16;
+      curr_b_1 += 16;
+    }
+
+    vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
+      c_regs[i * 2].save(curr_c_0);
+      c_regs[i * 2 + 1].save(curr_c_1);
+
+      // update
+      curr_c_0 += ldc;
+      curr_c_1 += ldc;
+    });
+  }
+};
+}  // namespace
+
+// Gemm kernel uses vector instructions, requires B matrix to be packed
+template <typename scalar_t>
+class MicroGemm<cpu_utils::ISA::VEC, scalar_t> {
+ public:
+  static constexpr int32_t MaxMSize = 8;
+  static constexpr int32_t NSize = 32;
+
+ public:
+  void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+    TileGemm82<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
+  }
+};
+}  // namespace cpu_micro_gemm
+
+#endif
--- a/csrc/cpu/scratchpad_manager.cpp
+++ b/csrc/cpu/scratchpad_manager.cpp
@@ -0,0 +1,23 @@
+#include <cstdlib>
+
+#include "scratchpad_manager.h"
+
+DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
+  this->realloc(allocation_unit * 128);
+}
+
+void DNNLScratchPadManager::realloc(size_t new_size) {
+  new_size = round(new_size);
+  if (new_size > size_) {
+    if (ptr_ != nullptr) {
+      std::free(ptr_);
+    }
+    ptr_ = std::aligned_alloc(64, new_size);
+    size_ = new_size;
+  }
+}
+
+DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
+  static DNNLScratchPadManager manager;
+  return &manager;
+}
--- a/csrc/cpu/scratchpad_manager.h
+++ b/csrc/cpu/scratchpad_manager.h
@@ -0,0 +1,31 @@
+#ifndef SCRATCHPAD_MANAGER_H
+#define SCRATCHPAD_MANAGER_H
+
+#include <cstddef>
+#include <cstdio>
+
+class DNNLScratchPadManager {
+ public:
+  static constexpr size_t allocation_unit = 4 * 1024;  // 4KB
+
+  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
+
+  DNNLScratchPadManager();
+
+  template <typename T>
+  T* get_data() {
+    return reinterpret_cast<T*>(ptr_);
+  }
+
+  static size_t round(size_t size) {
+    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
+  }
+
+  void realloc(size_t new_size);
+
+ private:
+  size_t size_;
+  void* ptr_;
+};
+
+#endif
--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@@ -192,7 +192,7 @@ class SHMManager {
                      const int group_size)
      : _rank(rank),
        _group_size(group_size),
-        _thread_num(torch::get_num_threads()),
+        _thread_num(omp_get_max_threads()),
        _shm_names({""}),
        _shared_mem_ptrs({nullptr}),
        _shm_ctx(nullptr) {
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -74,25 +74,45 @@ at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2,
                                     const std::optional<at::Tensor>& bias,
                                     at::ScalarType out_dtype, bool is_vnni);

+torch::Tensor get_scheduler_metadata(
+    const int64_t num_req, const int64_t num_heads_q,
+    const int64_t num_heads_kv, const int64_t head_dim,
+    const torch::Tensor& seq_lens, at::ScalarType dtype,
+    const torch::Tensor& query_start_loc, const bool casual,
+    const int64_t window_size, const std::string& isa_hint,
+    const bool enable_kv_split);
+
+void cpu_attn_reshape_and_cache(const torch::Tensor& key,
+                                const torch::Tensor& value,
+                                torch::Tensor& key_cache,
+                                torch::Tensor& value_cache,
+                                const torch::Tensor& slot_mapping,
+                                const std::string& isa);
+
+void cpu_attention_with_kv_cache(
+    const torch::Tensor& query, const torch::Tensor& key_cache,
+    const torch::Tensor& value_cache, torch::Tensor& output,
+    const torch::Tensor& query_start_loc, const torch::Tensor& seq_lens,
+    const double scale, const bool causal,
+    const std::optional<torch::Tensor>& alibi_slopes,
+    const int64_t sliding_window_left, const int64_t sliding_window_right,
+    const torch::Tensor& block_table, const double softcap,
+    const torch::Tensor& scheduler_metadata,
+    const std::optional<torch::Tensor>& s_aux);
+
+// Note: just for avoiding importing errors
+void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
+
+void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight,
+                    torch::Tensor& output, const torch::Tensor& scales,
+                    const std::optional<torch::Tensor>& zeros,
+                    const std::optional<torch::Tensor>& g_idx,
+                    const std::optional<torch::Tensor>& bias,
+                    const int64_t pack_factor, const std::string& isa_hint);
+
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

-  // Attention ops
-  // Compute the attention between an input query and the cached keys/values
-  // using PagedAttention.
-  ops.def(
-      "paged_attention_v1("
-      "    Tensor! out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
-      "    int tp_rank, int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-
-  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
-
  ops.def(
      "dynamic_4bit_int_moe("
      "Tensor x, Tensor topk_ids, Tensor topk_weights,"
@@ -102,20 +122,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);

-  // PagedAttention V2.
-  ops.def(
-      "paged_attention_v2("
-      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
-      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
-      "    int tp_rank, int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);
-
  // Activation ops

  // Activation function used in SwiGLU.
@@ -166,7 +172,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Quantization
 #if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
    defined(__powerpc64__)
-  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
  // Helper function to release oneDNN handlers
  ops.def("release_dnnl_matmul_handler(int handler) -> ()",
          &release_dnnl_matmul_handler);
@@ -202,15 +207,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
-      "Tensor? azp) -> ()",
-      {stride_tag});
+      "Tensor? azp) -> ()");
  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

  // Compute int8 quantized tensor and scaling factor
  ops.def(
      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
-      "Tensor!? azp) -> ()",
-      {stride_tag});
+      "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
 #endif
@@ -259,37 +262,40 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("int8_scaled_mm_with_quant", torch::kCPU,
           &int8_scaled_mm_with_quant);
 #endif
-}

-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
-  // Cache ops
-  // Swap in (out) the cache blocks from src to dst.
-  cache_ops.def(
-      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
-  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);
+  // CPU attention kernels
+  ops.def(
+      "get_scheduler_metadata(int num_req, int num_heads_q, int num_heads_kv, "
+      "int head_dim, Tensor seq_lens, ScalarType dtype, Tensor "
+      "query_start_loc, bool casual, int window_size, str isa_hint, bool "
+      "enable_kv_split) -> Tensor",
+      &get_scheduler_metadata);
+  ops.def(
+      "cpu_attn_reshape_and_cache(Tensor key, Tensor value, Tensor(a2!) "
+      "key_cache, Tensor(a3!) value_cache, Tensor slot_mapping, str "
+      "isa) -> ()",
+      &cpu_attn_reshape_and_cache);
+  ops.def(
+      "cpu_attention_with_kv_cache(Tensor query, Tensor key_cache, Tensor "
+      "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
+      "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
+      "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
+      "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
+      &cpu_attention_with_kv_cache);

-  // Copy the cache blocks from src to dst.
-  cache_ops.def(
-      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
-      "Tensor block_mapping) -> ()");
-  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
+  // placeholders
+  ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
+  ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
+  ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);

-  // Reshape the key and value tensors and cache them.
-  cache_ops.def(
-      "reshape_and_cache(Tensor key, Tensor value,"
-      "                  Tensor! key_cache, Tensor! value_cache,"
-      "                  Tensor slot_mapping,"
-      "                  str kv_cache_dtype,"
-      "                  Tensor k_scale, Tensor v_scale) -> ()");
-  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
-
-  cache_ops.def(
-      "concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
-      "                     Tensor! kv_cache,"
-      "                     Tensor slot_mapping,"
-      "                     str kv_cache_dtype,"
-      "                     Tensor scale) -> ()");
-  cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
+  // WNA16
+#if defined(__AVX512F__)
+  ops.def(
+      "cpu_gemm_wna16(Tensor input, Tensor q_weight, Tensor(a2!) output, "
+      "Tensor scales, Tensor? zeros, Tensor? g_idx, Tensor? bias, SymInt "
+      "pack_factor, str isa_hint) -> ()");
+  ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16);
+#endif
 }

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -45,21 +45,55 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
  // Memory node binding
  if (numa_available() != -1) {
    int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
-    bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str());
-    bitmask* src_mask = numa_get_membind();
-
-    int pid = getpid();
-
-    // move all existing pages to the specified numa node.
-    *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
-    int page_num = numa_migrate_pages(pid, src_mask, mask);
-    if (page_num == -1) {
-      TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno));
+    std::set<int> node_ids;
+    for (const auto& cpu_id : omp_cpu_ids) {
+      int node_id = numa_node_of_cpu(cpu_id);
+      if (node_id != -1) {
+        node_ids.insert(node_id);
+      }
+      if (node_id != mem_node_id) {
+        TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
+                   omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
+                   ". All CPUs should be on the same NUMA node for optimal "
+                   "performance. Memory will be bound to NUMA node ",
+                   mem_node_id, ".");
+      }
    }
+    // Concatenate all node_ids into a single comma-separated string
+    if (!node_ids.empty()) {
+      std::string node_ids_str;
+      for (const int node_id : node_ids) {
+        if (!node_ids_str.empty()) {
+          node_ids_str += ",";
+        }
+        node_ids_str += std::to_string(node_id);
+      }

-    // restrict memory allocation node.
-    numa_set_membind(mask);
-    numa_set_strict(1);
+      bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
+      bitmask* src_mask = numa_get_membind();
+
+      int pid = getpid();
+
+      if (mask && src_mask) {
+        // move all existing pages to the specified numa node.
+        *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
+        int page_num = numa_migrate_pages(pid, src_mask, mask);
+        if (page_num == -1) {
+          TORCH_WARN("numa_migrate_pages failed. errno: " +
+                     std::to_string(errno));
+        }
+
+        // restrict memory allocation node.
+        numa_set_membind(mask);
+        numa_set_strict(1);
+
+        numa_free_nodemask(mask);
+        numa_free_nodemask(src_mask);
+      } else {
+        TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
+                   std::to_string(errno));
+      }
+    }
  }

  // OMP threads binding
--- a/csrc/cpu/utils.hpp
+++ b/csrc/cpu/utils.hpp
@@ -0,0 +1,73 @@
+#ifndef UTILS_HPP
+#define UTILS_HPP
+
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <unistd.h>
+
+#if defined(__APPLE__)
+  #include <sys/sysctl.h>
+#endif
+
+#include "cpu_types.hpp"
+
+namespace cpu_utils {
+enum class ISA { AMX, VEC };
+
+template <typename T>
+struct VecTypeTrait {
+  using vec_t = void;
+};
+
+template <>
+struct VecTypeTrait<float> {
+  using vec_t = vec_op::FP32Vec16;
+};
+
+#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
+template <>
+struct VecTypeTrait<c10::BFloat16> {
+  using vec_t = vec_op::BF16Vec16;
+};
+#endif
+
+template <>
+struct VecTypeTrait<c10::Half> {
+  using vec_t = vec_op::FP16Vec16;
+};
+
+struct Counter {
+  std::atomic<int64_t> counter;
+  char _padding[56];
+
+  Counter() : counter(0) {}
+
+  void reset_counter() { counter.store(0); }
+
+  int64_t acquire_counter() { return counter++; }
+};
+
+inline int64_t get_l2_size() {
+  static int64_t size = []() {
+#if defined(__APPLE__)
+    // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
+    int64_t l2_cache_size = 0;
+    size_t len = sizeof(l2_cache_size);
+    if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
+        l2_cache_size > 0) {
+      return l2_cache_size >> 1;  // use 50% of L2 cache
+    }
+    // Fallback if sysctlbyname fails
+    return 128LL * 1024 >> 1;  // use 50% of 128KB
+#else
+    long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
+    assert(l2_cache_size != -1);
+    return l2_cache_size >> 1;  // use 50% of L2 cache
+#endif
+  }();
+  return size;
+}
+}  // namespace cpu_utils
+
+#endif
--- a/csrc/cuda_view.cu
+++ b/csrc/cuda_view.cu
@@ -22,15 +22,10 @@ torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
  auto strides = cpu_tensor.strides();
  auto options = cpu_tensor.options().device(torch::kCUDA);

-  // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
-  // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
-  // memory, so we don't free it here.
-  auto deleter = [](void*) {
-    // no-op, since the memory is owned by the original CPU tensor
-  };
-
+  // use default no-op deleter, since the memory is owned by the original CPU
+  // tensor
  torch::Tensor cuda_tensor =
-      torch::from_blob(device_ptr, sizes, strides, deleter, options);
+      torch::from_blob(device_ptr, sizes, strides, options);

  TORCH_CHECK(cuda_tensor.device().is_cuda(),
              "Resulting tensor is not on CUDA device");
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -3,14 +3,58 @@
 // need to be unsigned long long
 #include <iostream>

+#include "cumem_allocator_compat.h"
+
+#ifndef USE_ROCM
+static const char* PYARGS_PARSE = "KKKK";
+#else
+  #include <cstdlib>
+  #include <cerrno>
+  #include <climits>
+
+// Default chunk size 256MB for ROCm. Can be overridden at runtime by the
+// environment variable VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE, specified in megabytes
+// (MB). The env value is parsed with strtoull as an integer number of MB
+// (decimal or 0x hex). The parsed MB value is converted to bytes. If
+// parsing fails, the value is 0, or the multiplication would overflow,
+// the default (256MB) is used.
+static const unsigned long long DEFAULT_MEMCREATE_CHUNK_SIZE =
+    (256ULL * 1024ULL * 1024ULL);
+
+static unsigned long long get_memcreate_chunk_size() {
+  const char* env = getenv("VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE");
+  if (!env) return DEFAULT_MEMCREATE_CHUNK_SIZE;
+  char* endptr = nullptr;
+  errno = 0;
+  unsigned long long val_mb = strtoull(env, &endptr, 0);
+  if (endptr == env || errno != 0) {
+    // parsing failed, fallback to default
+    return DEFAULT_MEMCREATE_CHUNK_SIZE;
+  }
+  if (val_mb == 0) return DEFAULT_MEMCREATE_CHUNK_SIZE;
+
+  const unsigned long long MB = 1024ULL * 1024ULL;
+  // guard against overflow when converting MB -> bytes
+  if (val_mb > (ULLONG_MAX / MB)) {
+    return DEFAULT_MEMCREATE_CHUNK_SIZE;
+  }
+  return val_mb * MB;
+}
+
+static inline unsigned long long my_min(unsigned long long a,
+                                        unsigned long long b) {
+  return a < b ? a : b;
+}
+
+static const char* PYARGS_PARSE = "KKKO";
+#endif
+
 extern "C" {

 #define PY_SSIZE_T_CLEAN
 #include <Python.h>

 #include <sys/types.h>
-#include <cuda_runtime_api.h>
-#include <cuda.h>

 char error_msg[10240];  // 10KB buffer to store error messages
 CUresult no_error = CUresult(0);
@@ -49,7 +93,12 @@ void ensure_context(unsigned long long device) {
 }

 void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
+#ifndef USE_ROCM
                    CUmemGenericAllocationHandle* p_memHandle) {
+#else
+                    CUmemGenericAllocationHandle** p_memHandle,
+                    unsigned long long* chunk_sizes, size_t num_chunks) {
+#endif
  ensure_context(device);
  // Define memory allocation properties
  CUmemAllocationProp prop = {};
@@ -58,6 +107,7 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
  prop.location.id = device;
  prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;

+#ifndef USE_ROCM
  // Allocate memory using cuMemCreate
  CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
  if (error_code != 0) {
@@ -67,6 +117,39 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
  if (error_code != 0) {
    return;
  }
+#else
+  for (auto i = 0; i < num_chunks; ++i) {
+    CUDA_CHECK(cuMemCreate(p_memHandle[i], chunk_sizes[i], &prop, 0));
+    if (error_code != 0) {
+      // Clean up previously created handles
+      for (auto j = 0; j < i; ++j) {
+        cuMemRelease(*(p_memHandle[j]));
+      }
+      return;
+    }
+  }
+  unsigned long long allocated_size = 0;
+  for (auto i = 0; i < num_chunks; ++i) {
+    void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
+    CUDA_CHECK(cuMemMap(map_addr, chunk_sizes[i], 0, *(p_memHandle[i]), 0));
+    if (error_code != 0) {
+      // unmap previously mapped chunks
+      unsigned long long unmapped_size = 0;
+      for (auto j = 0; j < i; ++j) {
+        void* unmap_addr = (void*)((uintptr_t)d_mem + unmapped_size);
+        cuMemUnmap(unmap_addr, chunk_sizes[j]);
+        unmapped_size += chunk_sizes[j];
+      }
+      // release all created handles
+      for (auto j = 0; j < num_chunks; ++j) {
+        cuMemRelease(*(p_memHandle[j]));
+      }
+      return;
+    }
+    allocated_size += chunk_sizes[i];
+  }
+#endif
+
  CUmemAccessDesc accessDesc = {};
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = device;
@@ -82,10 +165,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,

 void unmap_and_release(unsigned long long device, ssize_t size,
                       CUdeviceptr d_mem,
+#ifndef USE_ROCM
                       CUmemGenericAllocationHandle* p_memHandle) {
+#else
+                       CUmemGenericAllocationHandle** p_memHandle,
+                       unsigned long long* chunk_sizes, size_t num_chunks) {
+#endif
  // std::cout << "unmap_and_release: device=" << device << ", size=" << size <<
  // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
  ensure_context(device);
+#ifndef USE_ROCM
  CUDA_CHECK(cuMemUnmap(d_mem, size));
  if (error_code != 0) {
    return;
@@ -94,6 +183,30 @@ void unmap_and_release(unsigned long long device, ssize_t size,
  if (error_code != 0) {
    return;
  }
+#else
+  unsigned long long allocated_size = 0;
+  CUresult first_error = no_error;
+
+  for (auto i = 0; i < num_chunks; ++i) {
+    void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
+    CUresult status = cuMemUnmap(map_addr, chunk_sizes[i]);
+    if (status != no_error && first_error == no_error) {
+      first_error = status;
+    }
+    allocated_size += chunk_sizes[i];
+  }
+
+  for (auto i = 0; i < num_chunks; ++i) {
+    CUresult status = cuMemRelease(*(p_memHandle[i]));
+    if (status != no_error && first_error == no_error) {
+      first_error = status;
+    }
+  }
+
+  if (first_error != no_error) {
+    CUDA_CHECK(first_error);
+  }
+#endif
 }

 PyObject* create_tuple_from_c_integers(unsigned long long a,
@@ -120,6 +233,36 @@ PyObject* create_tuple_from_c_integers(unsigned long long a,
  return tuple;  // Return the created tuple
 }

+PyObject* create_tuple_from_c_mixed(unsigned long long a, unsigned long long b,
+                                    unsigned long long c,
+                                    CUmemGenericAllocationHandle** vec,
+                                    unsigned long long* chunk_sizes,
+                                    size_t num_chunks) {
+  PyObject* tuple = PyTuple_New(4);
+  if (!tuple) {
+    return NULL;
+  }
+
+  // PyObject* list = PyList_New(vec.size());
+  PyObject* list = PyList_New(num_chunks);
+  for (auto i = 0; i < num_chunks; ++i) {
+    PyObject* addr_size_pair = PyTuple_New(2);
+    PyObject* addr = PyLong_FromUnsignedLongLong((unsigned long long)(vec[i]));
+    PyObject* size =
+        PyLong_FromUnsignedLongLong((unsigned long long)(chunk_sizes[i]));
+    PyTuple_SetItem(addr_size_pair, 0, addr);
+    PyTuple_SetItem(addr_size_pair, 1, size);
+    PyList_SetItem(list, i, addr_size_pair);
+  }
+
+  PyTuple_SetItem(tuple, 0, PyLong_FromUnsignedLongLong(a));
+  PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b));
+  PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c));
+  PyTuple_SetItem(tuple, 3, list);
+
+  return tuple;
+}
+
 // ---------------------------------------------------------------------------
 // Our exported C functions that call Python:

@@ -147,14 +290,55 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
  size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;

  CUdeviceptr d_mem;
+#ifndef USE_ROCM
  CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
  if (error_code != 0) {
    return nullptr;
  }
+#else
+  CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, granularity, 0, 0));
+  if (error_code != 0) {
+    return nullptr;
+  }
+#endif
+
+#ifndef USE_ROCM
  // allocate the CUmemGenericAllocationHandle
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)malloc(
          sizeof(CUmemGenericAllocationHandle));
+#else
+  // Make sure chunk size is aligned with hardware granularity. The base
+  // chunk size can be configured via environment variable
+  // ``VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE``; otherwise
+  // DEFAULT_MEMCREATE_CHUNK_SIZE is used.
+  size_t base_chunk = (size_t)get_memcreate_chunk_size();
+  size_t aligned_chunk_size =
+      ((base_chunk + granularity - 1) / granularity) * granularity;
+  size_t num_chunks =
+      (alignedSize + aligned_chunk_size - 1) / aligned_chunk_size;
+  CUmemGenericAllocationHandle** p_memHandle =
+      (CUmemGenericAllocationHandle**)malloc(
+          num_chunks * sizeof(CUmemGenericAllocationHandle*));
+  unsigned long long* chunk_sizes =
+      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
+  for (auto i = 0; i < num_chunks; ++i) {
+    p_memHandle[i] = (CUmemGenericAllocationHandle*)malloc(
+        sizeof(CUmemGenericAllocationHandle));
+    if (p_memHandle[i] == nullptr) {
+      std::cerr << "ERROR: malloc failed for p_memHandle[" << i << "].\n";
+      for (auto j = 0; j < i; ++j) {
+        free(p_memHandle[j]);
+      }
+      free(p_memHandle);
+      free(chunk_sizes);
+      return nullptr;
+    }
+    chunk_sizes[i] = (unsigned long long)my_min(
+        (unsigned long long)(alignedSize - i * aligned_chunk_size),
+        (unsigned long long)aligned_chunk_size);
+  }
+#endif

  if (!g_python_malloc_callback) {
    std::cerr << "ERROR: g_python_malloc_callback not set.\n";
@@ -164,9 +348,15 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
  // Acquire GIL (not in stable ABI officially, but often works)
  PyGILState_STATE gstate = PyGILState_Ensure();

+#ifndef USE_ROCM
  PyObject* arg_tuple = create_tuple_from_c_integers(
      (unsigned long long)device, (unsigned long long)alignedSize,
      (unsigned long long)d_mem, (unsigned long long)p_memHandle);
+#else
+  PyObject* arg_tuple = create_tuple_from_c_mixed(
+      (unsigned long long)device, (unsigned long long)alignedSize,
+      (unsigned long long)d_mem, p_memHandle, chunk_sizes, num_chunks);
+#endif

  // Call g_python_malloc_callback
  PyObject* py_result =
@@ -182,7 +372,27 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
  PyGILState_Release(gstate);

  // do the final mapping
+#ifndef USE_ROCM
  create_and_map(device, alignedSize, d_mem, p_memHandle);
+#else
+  create_and_map(device, alignedSize, d_mem, p_memHandle, chunk_sizes,
+                 num_chunks);
+  free(chunk_sizes);
+#endif
+
+  if (error_code != 0) {
+    // free address and the handle
+    CUDA_CHECK(cuMemAddressFree(d_mem, alignedSize));
+#ifndef USE_ROCM
+    free(p_memHandle);
+#else
+    for (size_t i = 0; i < num_chunks; ++i) {
+      free(p_memHandle[i]);
+    }
+    free(p_memHandle);
+#endif
+    return nullptr;
+  }

  return (void*)d_mem;
 }
@@ -206,36 +416,96 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {

  if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    Py_XDECREF(py_result);
+    Py_XDECREF(py_ptr);
    return;
  }

  unsigned long long recv_device, recv_size;
-  unsigned long long recv_d_mem, recv_p_memHandle;
+  unsigned long long recv_d_mem;
+#ifndef USE_ROCM
+  unsigned long long recv_p_memHandle;
+#else
+  PyObject* recv_p_memHandle;
+#endif
  // Unpack the tuple into four C integers
-  if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
+  if (!PyArg_ParseTuple(py_result, PYARGS_PARSE, &recv_device, &recv_size,
                        &recv_d_mem, &recv_p_memHandle)) {
    // PyArg_ParseTuple sets an error if it fails
+    Py_XDECREF(py_result);
+    Py_XDECREF(py_ptr);
    return;
  }

+  // For ROCm, copy the Python list of (addr,size) pairs into C arrays while
+  // holding the GIL. Then release the GIL and call the unmap/release helper
+  // using the copied arrays. This avoids calling PyList_* APIs without the
+  // GIL (which is undefined behavior and can crash when called from other
+  // threads).
+  CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem;
+#ifdef USE_ROCM
+  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
+  CUmemGenericAllocationHandle** p_memHandle =
+      (CUmemGenericAllocationHandle**)malloc(
+          num_chunks * sizeof(CUmemGenericAllocationHandle*));
+  if (p_memHandle == nullptr) {
+    Py_DECREF(py_ptr);
+    Py_DECREF(py_result);
+    PyGILState_Release(gstate);
+    std::cerr << "ERROR: malloc failed for p_memHandle in my_free."
+              << std::endl;
+    return;
+  }
+  unsigned long long* chunk_sizes =
+      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
+  if (chunk_sizes == nullptr) {
+    free(p_memHandle);
+    Py_DECREF(py_ptr);
+    Py_DECREF(py_result);
+    PyGILState_Release(gstate);
+    std::cerr << "ERROR: malloc failed for chunk_sizes in my_free."
+              << std::endl;
+    return;
+  }
+  for (Py_ssize_t i = 0; i < num_chunks; ++i) {
+    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
+    PyObject* addr_py = PyTuple_GetItem(item, 0);
+    PyObject* size_py = PyTuple_GetItem(item, 1);
+    p_memHandle[i] =
+        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
+    chunk_sizes[i] = (unsigned long long)PyLong_AsUnsignedLongLong(size_py);
+  }
+
+  // Drop temporary Python refs, then release the GIL before calling into
+  // non-Python APIs.
+  Py_DECREF(py_ptr);
+  Py_DECREF(py_result);
  PyGILState_Release(gstate);

-  // recv_size == size
-  // recv_device == device
+  unmap_and_release(device, size, d_mem, p_memHandle, chunk_sizes, num_chunks);
+#else
+  // Non-ROCm path: simple integer handle already extracted; drop temporary
+  // Python refs while still holding the GIL, then release it.
+  Py_DECREF(py_ptr);
+  Py_DECREF(py_result);
+  PyGILState_Release(gstate);

-  // Free memory
-
-  CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem;
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)recv_p_memHandle;
  unmap_and_release(device, size, d_mem, p_memHandle);
+#endif

  // free address and the handle
  CUDA_CHECK(cuMemAddressFree(d_mem, size));
-  if (error_code != 0) {
-    return;
+#ifndef USE_ROCM
+  free(p_memHandle);
+#else
+  for (auto i = 0; i < num_chunks; ++i) {
+    free(p_memHandle[i]);
  }
  free(p_memHandle);
+  free(chunk_sizes);
+#endif
 }

 // ---------------------------------------------------------------------------
@@ -271,19 +541,87 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
  }

  unsigned long long recv_device, recv_size;
-  unsigned long long recv_d_mem, recv_p_memHandle;
+  unsigned long long recv_d_mem;
+#ifndef USE_ROCM
+  unsigned long long recv_p_memHandle;
+#else
+  PyObject* recv_p_memHandle;
+#endif
  // Unpack the tuple into four C integers
-  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
-                        &recv_p_memHandle)) {
+  if (!PyArg_ParseTuple(args, PYARGS_PARSE, &recv_device, &recv_size,
+                        &recv_d_mem, &recv_p_memHandle)) {
    // PyArg_ParseTuple sets an error if it fails
    return nullptr;
  }

  CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem;
+#ifndef USE_ROCM
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)recv_p_memHandle;

  unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
+#else
+  if (!PyList_Check(recv_p_memHandle)) {
+    PyErr_SetString(PyExc_TypeError,
+                    "Expected a list for the 4th argument on ROCm");
+    return nullptr;
+  }
+  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
+  if (num_chunks < 0) {
+    return nullptr;  // PyList_Size sets an exception on error.
+  }
+  CUmemGenericAllocationHandle** p_memHandle =
+      (CUmemGenericAllocationHandle**)malloc(
+          num_chunks * sizeof(CUmemGenericAllocationHandle*));
+  if (p_memHandle == nullptr) {
+    PyErr_SetString(PyExc_MemoryError, "malloc failed for p_memHandle");
+    return nullptr;
+  }
+  unsigned long long* chunk_sizes =
+      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
+  if (chunk_sizes == nullptr) {
+    free(p_memHandle);
+    PyErr_SetString(PyExc_MemoryError, "malloc failed for chunk_sizes");
+    return nullptr;
+  }
+  for (Py_ssize_t i = 0; i < num_chunks; ++i) {
+    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
+    if (item == nullptr || !PyTuple_Check(item) || PyTuple_Size(item) != 2) {
+      free(p_memHandle);
+      free(chunk_sizes);
+      PyErr_SetString(
+          PyExc_TypeError,
+          "List items must be tuples of size 2 (handle_addr, size)");
+      return nullptr;
+    }
+    PyObject* addr_py = PyTuple_GetItem(item, 0);
+    PyObject* size_py = PyTuple_GetItem(item, 1);
+    if (addr_py == nullptr || size_py == nullptr) {
+      free(p_memHandle);
+      free(chunk_sizes);
+      return nullptr;  // PyTuple_GetItem sets an exception
+    }
+    p_memHandle[i] =
+        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
+    if (PyErr_Occurred()) {
+      free(p_memHandle);
+      free(chunk_sizes);
+      return nullptr;
+    }
+    chunk_sizes[i] = (unsigned long long)PyLong_AsUnsignedLongLong(size_py);
+    if (PyErr_Occurred()) {
+      free(p_memHandle);
+      free(chunk_sizes);
+      return nullptr;
+    }
+  }
+
+  unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle, chunk_sizes,
+                    num_chunks);
+
+  free(p_memHandle);
+  free(chunk_sizes);
+#endif

  if (error_code != 0) {
    error_code = no_error;
@@ -301,19 +639,56 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
  }

  unsigned long long recv_device, recv_size;
-  unsigned long long recv_d_mem, recv_p_memHandle;
+  unsigned long long recv_d_mem;
+#ifndef USE_ROCM
+  unsigned long long recv_p_memHandle;
+#else
+  PyObject* recv_p_memHandle;
+#endif
  // Unpack the tuple into four C integers
-  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
-                        &recv_p_memHandle)) {
+  if (!PyArg_ParseTuple(args, PYARGS_PARSE, &recv_device, &recv_size,
+                        &recv_d_mem, &recv_p_memHandle)) {
    // PyArg_ParseTuple sets an error if it fails
    return nullptr;
  }

  CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem;
+#ifndef USE_ROCM
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)recv_p_memHandle;

  create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
+#else
+  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
+  CUmemGenericAllocationHandle** p_memHandle =
+      (CUmemGenericAllocationHandle**)malloc(
+          num_chunks * sizeof(CUmemGenericAllocationHandle*));
+  if (p_memHandle == nullptr) {
+    PyErr_SetString(PyExc_MemoryError, "malloc failed for p_memHandle");
+    return nullptr;
+  }
+  unsigned long long* chunk_sizes =
+      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
+  if (chunk_sizes == nullptr) {
+    free(p_memHandle);
+    PyErr_SetString(PyExc_MemoryError, "malloc failed for chunk_sizes");
+    return nullptr;
+  }
+  for (auto i = 0; i < num_chunks; ++i) {
+    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
+    PyObject* addr_py = PyTuple_GetItem(item, 0);
+    PyObject* size_py = PyTuple_GetItem(item, 1);
+    p_memHandle[i] =
+        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
+    chunk_sizes[i] = PyLong_AsUnsignedLongLong(size_py);
+  }
+
+  create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle, chunk_sizes,
+                 num_chunks);
+
+  free(p_memHandle);
+  free(chunk_sizes);
+#endif

  if (error_code != 0) {
    error_code = no_error;
--- a/csrc/cumem_allocator_compat.h
+++ b/csrc/cumem_allocator_compat.h
@@ -0,0 +1,109 @@
+#pragma once
+
+#ifdef USE_ROCM
+////////////////////////////////////////
+// For compatibility with CUDA and ROCm
+////////////////////////////////////////
+  #include <hip/hip_runtime_api.h>
+
+extern "C" {
+  #ifndef CUDA_SUCCESS
+    #define CUDA_SUCCESS hipSuccess
+  #endif  // CUDA_SUCCESS
+
+// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html
+typedef unsigned long long CUdevice;
+typedef hipDeviceptr_t CUdeviceptr;
+typedef hipError_t CUresult;
+typedef hipCtx_t CUcontext;
+typedef hipStream_t CUstream;
+typedef hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle;
+typedef hipMemAllocationGranularity_flags CUmemAllocationGranularity_flags;
+typedef hipMemAllocationProp CUmemAllocationProp;
+typedef hipMemAccessDesc CUmemAccessDesc;
+
+  #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+  #define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+  #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+  #define CU_MEM_ALLOC_GRANULARITY_MINIMUM hipMemAllocationGranularityMinimum
+
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
+  #define CU_MEM_ALLOCATION_COMP_NONE 0x0
+
+// Error Handling
+// https://docs.nvidia.com/cuda/archive/11.4.4/cuda-driver-api/group__CUDA__ERROR.html
+CUresult cuGetErrorString(CUresult hipError, const char** pStr) {
+  *pStr = hipGetErrorString(hipError);
+  return CUDA_SUCCESS;
+}
+
+// Context Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html
+CUresult cuCtxGetCurrent(CUcontext* ctx) {
+  // This API is deprecated on the AMD platform, only for equivalent cuCtx
+  // driver API on the NVIDIA platform.
+  return hipCtxGetCurrent(ctx);
+}
+
+CUresult cuCtxSetCurrent(CUcontext ctx) {
+  // This API is deprecated on the AMD platform, only for equivalent cuCtx
+  // driver API on the NVIDIA platform.
+  return hipCtxSetCurrent(ctx);
+}
+
+// Primary Context Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html
+CUresult cuDevicePrimaryCtxRetain(CUcontext* ctx, CUdevice dev) {
+  return hipDevicePrimaryCtxRetain(ctx, dev);
+}
+
+// Virtual Memory Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html
+CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) {
+  return hipMemAddressFree(ptr, size);
+}
+
+CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment,
+                             CUdeviceptr addr, unsigned long long flags) {
+  return hipMemAddressReserve(ptr, size, alignment, addr, flags);
+}
+
+CUresult cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size,
+                     const CUmemAllocationProp* prop,
+                     unsigned long long flags) {
+  return hipMemCreate(handle, size, prop, flags);
+}
+
+CUresult cuMemGetAllocationGranularity(
+    size_t* granularity, const CUmemAllocationProp* prop,
+    CUmemAllocationGranularity_flags option) {
+  return hipMemGetAllocationGranularity(granularity, prop, option);
+}
+
+CUresult cuMemMap(CUdeviceptr dptr, size_t size, size_t offset,
+                  CUmemGenericAllocationHandle handle,
+                  unsigned long long flags) {
+  return hipMemMap(dptr, size, offset, handle, flags);
+}
+
+CUresult cuMemRelease(CUmemGenericAllocationHandle handle) {
+  return hipMemRelease(handle);
+}
+
+CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
+                        const CUmemAccessDesc* desc, size_t count) {
+  return hipMemSetAccess(ptr, size, desc, count);
+}
+
+CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) {
+  return hipMemUnmap(ptr, size);
+}
+}  // extern "C"
+
+#else
+////////////////////////////////////////
+// Import CUDA headers for NVIDIA GPUs
+////////////////////////////////////////
+  #include <cuda_runtime_api.h>
+  #include <cuda.h>
+#endif
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -88,3 +88,53 @@
 #define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(                                              \
      TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
+
+#define VLLM_DISPATCH_VEC_SIZE(VEC_SIZE, ...) \
+  switch (VEC_SIZE) {                         \
+    case 16: {                                \
+      constexpr int vec_size = 16;            \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+    case 8: {                                 \
+      constexpr int vec_size = 8;             \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+    case 4: {                                 \
+      constexpr int vec_size = 4;             \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+    case 2: {                                 \
+      constexpr int vec_size = 2;             \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+    default: {                                \
+      constexpr int vec_size = 1;             \
+      __VA_ARGS__();                          \
+      break;                                  \
+    }                                         \
+  }
+
+#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...)                                   \
+  switch (NUM_DIMS) {                                                          \
+    case 2: {                                                                  \
+      constexpr int tensor_rank = 2;                                           \
+      __VA_ARGS__();                                                           \
+      break;                                                                   \
+    }                                                                          \
+    case 3: {                                                                  \
+      constexpr int tensor_rank = 3;                                           \
+      __VA_ARGS__();                                                           \
+      break;                                                                   \
+    }                                                                          \
+    case 4: {                                                                  \
+      constexpr int tensor_rank = 4;                                           \
+      __VA_ARGS__();                                                           \
+      break;                                                                   \
+    }                                                                          \
+    default:                                                                   \
+      TORCH_CHECK(false, "Expects rank 2, 3 or 4 tensors but got ", NUM_DIMS); \
+  }
--- a/csrc/fused_qknorm_rope_kernel.cu
+++ b/csrc/fused_qknorm_rope_kernel.cu
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cmath>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+#include <torch/cuda.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+#include "type_convert.cuh"
+
+#define CHECK_TYPE(x, st)                                              \
+  TORCH_CHECK(x.scalar_type() == st, #x " dtype is ", x.scalar_type(), \
+              ", while ", st, " is expected")
+#define CHECK_TH_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_TH_CUDA(x);    \
+  CHECK_CONTIGUOUS(x)
+
+#ifdef USE_ROCM
+  #define FINAL_MASK 0xffffffffffffffffULL
+
+  #if defined(HIP_VERSION) && HIP_VERSION < 70000000
+// On ROCm versions before 7.0, __syncwarp isn't defined. The below
+// implementation is copy/pasted from the implementation in ROCm 7.0
+__device__ inline void __syncwarp() {
+  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
+  __builtin_amdgcn_wave_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
+}
+  #endif
+#else
+  #define FINAL_MASK 0xffffffff
+#endif
+
+namespace tensorrt_llm::common {
+template <typename T, int num>
+struct packed_as;
+// Specialization for packed_as used in this kernel.
+template <>
+struct packed_as<uint, 1> {
+  using type = uint;
+};
+
+template <>
+struct packed_as<uint, 2> {
+  using type = uint2;
+};
+
+template <>
+struct packed_as<uint, 4> {
+  using type = uint4;
+};
+
+template <typename T>
+__inline__ __device__ T warpReduceSum(T val) {
+#pragma unroll
+  for (int mask = 16; mask > 0; mask >>= 1)
+    val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
+  return val;
+}
+
+template <typename T>
+inline __device__ __host__ T divUp(T m, T n) {
+  return (m + n - 1) / n;
+}
+
+}  // namespace tensorrt_llm::common
+
+namespace tensorrt_llm::kernels {
+// NOTE(zhuhaoran): This kernel is adapted from TensorRT-LLM implementation,
+// with added support for passing the cos_sin_cache as an input.
+// https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu
+
+// Perform per-head QK Norm and RoPE in a single kernel.
+// scalar_t_in: data type of QKV and RMSNorm weights
+// scalar_t_cache: data type of cos/sin cache
+// head_dim: the dimension of each head
+// interleave: interleave=!is_neox.
+template <typename scalar_t_in, typename scalar_t_cache, int head_dim,
+          bool interleave>
+__global__ void fusedQKNormRopeKernel(
+    void* qkv_void,                  // Combined QKV tensor
+    int const num_heads_q,           // Number of query heads
+    int const num_heads_k,           // Number of key heads
+    int const num_heads_v,           // Number of value heads
+    float const eps,                 // Epsilon for RMS normalization
+    void const* q_weight_void,       // RMSNorm weights for query
+    void const* k_weight_void,       // RMSNorm weights for key
+    void const* cos_sin_cache_void,  // Pre-computed cos/sin cache
+    int64_t const* position_ids,     // Position IDs for RoPE
+    int const num_tokens             // Number of tokens
+) {
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
+  if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
+                std::is_same_v<scalar_t_cache, c10::BFloat16>) {
+    return;
+  } else {
+#endif
+
+    using Converter = vllm::_typeConvert<scalar_t_in>;
+    static_assert(Converter::exists,
+                  "Input QKV data type is not supported for this CUDA "
+                  "architecture or toolkit version.");
+    using T_in = typename Converter::hip_type;
+    using T2_in = typename Converter::packed_hip_type;
+
+    using CacheConverter = vllm::_typeConvert<scalar_t_cache>;
+    static_assert(CacheConverter::exists,
+                  "Cache data type is not supported for this CUDA architecture "
+                  "or toolkit version.");
+    using T_cache = typename CacheConverter::hip_type;
+
+    T_in* qkv = reinterpret_cast<T_in*>(qkv_void);
+    T_in const* q_weight = reinterpret_cast<T_in const*>(q_weight_void);
+    T_in const* k_weight = reinterpret_cast<T_in const*>(k_weight_void);
+    T_cache const* cos_sin_cache =
+        reinterpret_cast<T_cache const*>(cos_sin_cache_void);
+
+    int const warpsPerBlock = blockDim.x / 32;
+    int const warpId = threadIdx.x / 32;
+    int const laneId = threadIdx.x % 32;
+
+    // Calculate global warp index to determine which head/token this warp
+    // processes
+    int const globalWarpIdx = blockIdx.x * warpsPerBlock + warpId;
+
+    // Total number of attention heads (Q and K)
+    int const total_qk_heads = num_heads_q + num_heads_k;
+
+    // Determine which token and head type (Q or K) this warp processes
+    int const tokenIdx = globalWarpIdx / total_qk_heads;
+    int const localHeadIdx = globalWarpIdx % total_qk_heads;
+
+    // Skip if this warp is assigned beyond the number of tokens
+    if (tokenIdx >= num_tokens) return;
+
+    bool const isQ = localHeadIdx < num_heads_q;
+    int const headIdx = isQ ? localHeadIdx : localHeadIdx - num_heads_q;
+
+    int const num_heads = num_heads_q + num_heads_k + num_heads_v;
+
+    static_assert(head_dim % (32 * 2) == 0,
+                  "head_dim must be divisible by 64 (each warp processes one "
+                  "head, and each thread gets even number of "
+                  "elements)");
+    constexpr int numElemsPerThread = head_dim / 32;
+    float elements[numElemsPerThread];
+    constexpr int elemSizeBytes = numElemsPerThread * sizeof(__nv_bfloat16);
+    static_assert(elemSizeBytes % 4 == 0,
+                  "numSizeBytes must be a multiple of 4");
+    constexpr int vecSize =
+        elemSizeBytes /
+        4;  // Use packed_as<uint, vecSize> to perform loading/saving.
+    using vec_T = typename tensorrt_llm::common::packed_as<uint, vecSize>::type;
+
+    int offsetWarp;  // Offset for the warp
+    if (isQ) {
+      // Q segment: token offset + head offset within Q segment
+      offsetWarp = tokenIdx * num_heads * head_dim + headIdx * head_dim;
+    } else {
+      // K segment: token offset + entire Q segment + head offset within K
+      // segment
+      offsetWarp = tokenIdx * num_heads * head_dim + num_heads_q * head_dim +
+                   headIdx * head_dim;
+    }
+    int offsetThread = offsetWarp + laneId * numElemsPerThread;
+
+    // Sum of squares for RMSNorm
+    float sumOfSquares = 0.0f;
+
+    // Load.
+    {
+      vec_T vec = *reinterpret_cast<vec_T const*>(&qkv[offsetThread]);
+      constexpr int num_packed_elems = elemSizeBytes / sizeof(T2_in);
+#pragma unroll
+      for (int i = 0; i < num_packed_elems; i++) {
+        // Interpret the generic vector chunk as the specific packed type
+        T2_in packed_val = *(reinterpret_cast<T2_in*>(&vec) + i);
+        // Convert to float2 for computation
+        float2 vals = Converter::convert(packed_val);
+        sumOfSquares += vals.x * vals.x;
+        sumOfSquares += vals.y * vals.y;
+
+        elements[2 * i] = vals.x;
+        elements[2 * i + 1] = vals.y;
+      }
+    }
+
+    // Reduce sum across warp using the utility function
+    sumOfSquares = tensorrt_llm::common::warpReduceSum(sumOfSquares);
+
+    // Compute RMS normalization factor
+    float rms_rcp = rsqrtf(sumOfSquares / static_cast<float>(head_dim) + eps);
+
+    // Normalize elements
+#pragma unroll
+    for (int i = 0; i < numElemsPerThread; i++) {
+      int dim = laneId * numElemsPerThread + i;
+      float weight = isQ ? Converter::convert(q_weight[dim])
+                         : Converter::convert(k_weight[dim]);
+      elements[i] *= rms_rcp * weight;
+    }
+
+    // Apply RoPE to normalized elements
+    float elements2[numElemsPerThread];  // Additional buffer required for RoPE.
+
+    int64_t pos_id = position_ids[tokenIdx];
+
+    // Calculate cache pointer for this position - similar to
+    // pos_encoding_kernels.cu
+    T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim;
+    int const embed_dim = head_dim / 2;
+    T_cache const* cos_ptr = cache_ptr;
+    T_cache const* sin_ptr = cache_ptr + embed_dim;
+
+    if constexpr (interleave) {
+      // Perform interleaving. Use pre-computed cos/sin values.
+#pragma unroll
+      for (int i = 0; i < numElemsPerThread / 2; ++i) {
+        int const idx0 = 2 * i;
+        int const idx1 = 2 * i + 1;
+
+        float const val0 = elements[idx0];
+        float const val1 = elements[idx1];
+
+        int const dim_idx = laneId * numElemsPerThread + idx0;
+        int const half_dim = dim_idx / 2;
+        float const cos_val =
+            CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
+        float const sin_val =
+            CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+
+        elements[idx0] = val0 * cos_val - val1 * sin_val;
+        elements[idx1] = val0 * sin_val + val1 * cos_val;
+      }
+    } else {
+      // Before data exchange with in warp, we need to sync.
+      __syncwarp();
+      // Get the data from the other half of the warp. Use pre-computed cos/sin
+      // values.
+#pragma unroll
+      for (int i = 0; i < numElemsPerThread; i++) {
+        elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
+        if (laneId < 16) {
+          elements2[i] = -elements2[i];
+        }
+
+        int dim_idx = laneId * numElemsPerThread + i;
+        dim_idx = (dim_idx * 2) % head_dim;
+        int half_dim = dim_idx / 2;
+        // Use pre-computed cos/sin from cache
+        float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
+        float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+
+        elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
+      }
+      // __shfl_xor_sync does not provide memfence. Need to sync again.
+      __syncwarp();
+    }
+
+    // Store.
+    {
+      vec_T vec;
+      constexpr int num_packed_elems = elemSizeBytes / sizeof(T2_in);
+#pragma unroll
+      for (int i = 0; i < num_packed_elems; i++) {
+        // Convert from float2 back to the specific packed type
+        T2_in packed_val = Converter::convert(
+            make_float2(elements[2 * i], elements[2 * i + 1]));
+        // Place it into the generic vector
+        *(reinterpret_cast<T2_in*>(&vec) + i) = packed_val;
+      }
+      *reinterpret_cast<vec_T*>(&qkv[offsetThread]) = vec;
+    }
+
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
+  }
+#endif
+}
+
+// Borrowed from
+// https://github.com/flashinfer-ai/flashinfer/blob/8125d079a43e9a0ba463a4ed1b639cefd084cec9/include/flashinfer/pos_enc.cuh#L568
+#define DISPATCH_INTERLEAVE(interleave, INTERLEAVE, ...) \
+  if (interleave) {                                      \
+    const bool INTERLEAVE = true;                        \
+    __VA_ARGS__                                          \
+  } else {                                               \
+    const bool INTERLEAVE = false;                       \
+    __VA_ARGS__                                          \
+  }
+
+template <typename scalar_t_in, typename scalar_t_cache>
+void launchFusedQKNormRope(void* qkv, int const num_tokens,
+                           int const num_heads_q, int const num_heads_k,
+                           int const num_heads_v, int const head_dim,
+                           float const eps, void const* q_weight,
+                           void const* k_weight, void const* cos_sin_cache,
+                           bool const interleave, int64_t const* position_ids,
+                           cudaStream_t stream) {
+  constexpr int blockSize = 256;
+
+  int const warpsPerBlock = blockSize / 32;
+  int const totalQKHeads = num_heads_q + num_heads_k;
+  int const totalWarps = num_tokens * totalQKHeads;
+
+  int const gridSize = common::divUp(totalWarps, warpsPerBlock);
+  dim3 gridDim(gridSize);
+  dim3 blockDim(blockSize);
+
+  switch (head_dim) {
+    case 64:
+      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
+            <<<gridDim, blockDim, 0, stream>>>(
+                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
+                k_weight, cos_sin_cache, position_ids, num_tokens);
+      });
+      break;
+    case 128:
+      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
+            <<<gridDim, blockDim, 0, stream>>>(
+                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
+                k_weight, cos_sin_cache, position_ids, num_tokens);
+      });
+      break;
+    case 256:
+      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
+            <<<gridDim, blockDim, 0, stream>>>(
+                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
+                k_weight, cos_sin_cache, position_ids, num_tokens);
+      });
+      break;
+    default:
+      TORCH_CHECK(false,
+                  "Unsupported head dimension for fusedQKNormRope: ", head_dim);
+  }
+}
+}  // namespace tensorrt_llm::kernels
+
+void fused_qk_norm_rope(
+    torch::Tensor& qkv,       // Combined QKV tensor [num_tokens,
+                              // (num_heads_q+num_heads_k+num_heads_v)*head_dim]
+    int64_t num_heads_q,      // Number of query heads
+    int64_t num_heads_k,      // Number of key heads
+    int64_t num_heads_v,      // Number of value heads
+    int64_t head_dim,         // Dimension per head
+    double eps,               // Epsilon for RMS normalization
+    torch::Tensor& q_weight,  // RMSNorm weights for query [head_dim]
+    torch::Tensor& k_weight,  // RMSNorm weights for key [head_dim]
+    torch::Tensor& cos_sin_cache,  // Cos/sin cache [max_position, head_dim]
+    bool is_neox,                  // Whether RoPE is applied in Neox style
+    torch::Tensor& position_ids    // Position IDs for RoPE [num_tokens]
+) {
+  // Input validation
+  CHECK_INPUT(qkv);
+  CHECK_INPUT(position_ids);
+  CHECK_INPUT(q_weight);
+  CHECK_INPUT(k_weight);
+  CHECK_INPUT(cos_sin_cache);
+  CHECK_TYPE(position_ids, torch::kInt64);
+
+  TORCH_CHECK(qkv.dim() == 2,
+              "QKV tensor must be 2D: [num_tokens, "
+              "(num_heads_q+num_heads_k+num_heads_v)*head_dim]");
+  TORCH_CHECK(position_ids.dim() == 1, "Position IDs must be 1D: [num_tokens]");
+  TORCH_CHECK(q_weight.dim() == 1, "Query weights must be 1D: [head_dim]");
+  TORCH_CHECK(k_weight.dim() == 1, "Key weights must be 1D: [head_dim]");
+  TORCH_CHECK(cos_sin_cache.dim() == 2,
+              "Cos/sin cache must be 2D: [max_position, head_dim]");
+  TORCH_CHECK(q_weight.size(0) == head_dim,
+              "Query weights size must match head dimension");
+  TORCH_CHECK(k_weight.size(0) == head_dim,
+              "Key weights size must match head dimension");
+  TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
+              "Cos/sin cache dimension must match head_dim");
+  TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
+                  qkv.scalar_type() == k_weight.scalar_type(),
+              "qkv, q_weight and k_weight must have the same dtype");
+
+  int64_t num_tokens = qkv.size(0);
+  TORCH_CHECK(position_ids.size(0) == num_tokens,
+              "Number of tokens in position_ids must match QKV");
+
+  int64_t total_heads = num_heads_q + num_heads_k + num_heads_v;
+  TORCH_CHECK(
+      qkv.size(1) == total_heads * head_dim,
+      "QKV tensor size must match total number of heads and head dimension");
+
+  auto stream = at::cuda::getCurrentCUDAStream(qkv.get_device());
+
+  VLLM_DISPATCH_HALF_TYPES(qkv.scalar_type(), "fused_qk_norm_rope_kernel", [&] {
+    using qkv_scalar_t = scalar_t;
+    VLLM_DISPATCH_FLOATING_TYPES(
+        cos_sin_cache.scalar_type(), "fused_qk_norm_rope_kernel", [&] {
+          using cache_scalar_t = scalar_t;
+          tensorrt_llm::kernels::launchFusedQKNormRope<qkv_scalar_t,
+                                                       cache_scalar_t>(
+              qkv.data_ptr(), static_cast<int>(num_tokens),
+              static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
+              static_cast<int>(num_heads_v), static_cast<int>(head_dim),
+              static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(),
+              cos_sin_cache.data_ptr(), !is_neox,
+              reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
+              stream);
+        });
+  });
+}
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -10,18 +10,39 @@
 namespace vllm {

 // TODO(woosuk): Further optimize this kernel.
-template <typename scalar_t>
+template <typename scalar_t, int VEC_SIZE, int NUM_DIMS>
 __global__ void rms_norm_kernel(
-    scalar_t* __restrict__ out,          // [..., hidden_size]
-    const scalar_t* __restrict__ input,  // [..., hidden_size]
-    const int64_t input_stride,
+    scalar_t* __restrict__ out,           // [..., hidden_size]
+    const scalar_t* __restrict__ input,   // [..., hidden_size]
+    const int64_t input_stride_d2,        // input.stride(-2)
+    const int64_t input_stride_d3,        // input.stride(-3)
+    const int64_t input_stride_d4,        // input.stride(-4)
+    const int64_t input_shape_d2,         // input.size(-2)
+    const int64_t input_shape_d3,         // input.size(-3)
    const scalar_t* __restrict__ weight,  // [hidden_size]
    const float epsilon, const int num_tokens, const int hidden_size) {
  __shared__ float s_variance;
  float variance = 0.0f;
-  const scalar_t* input_row = input + blockIdx.x * input_stride;
+  const scalar_t* input_row;
+  if constexpr (NUM_DIMS == 2) {
+    // 2D for layernorm normal case [batch_size, hidden]
+    input_row = input + blockIdx.x * input_stride_d2;
+  } else if constexpr (NUM_DIMS == 3) {
+    // 3D for q/k norm [batch_size, num_heads, head_size]
+    int batch_idx = blockIdx.x / input_shape_d2;
+    int head_idx = blockIdx.x % input_shape_d2;
+    input_row =
+        input + batch_idx * input_stride_d3 + head_idx * input_stride_d2;
+  } else if constexpr (NUM_DIMS == 4) {
+    // 4D for transformers model_impl qk norm [batch, seq, head, head_dim]
+    int batch_idx = blockIdx.x / (input_shape_d3 * input_shape_d2);
+    int remaining = blockIdx.x % (input_shape_d3 * input_shape_d2);
+    int seq_idx = remaining / input_shape_d2;
+    int head_idx = remaining % input_shape_d2;
+    input_row = input + batch_idx * input_stride_d4 +
+                seq_idx * input_stride_d3 + head_idx * input_stride_d2;
+  }

-  constexpr int VEC_SIZE = 8;
  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
 #pragma unroll
    for (int i = 0; i < VEC_SIZE; ++i) {
@@ -45,10 +66,20 @@ __global__ void rms_norm_kernel(
  }
  __syncthreads();

-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * input_stride + idx];
-    out[blockIdx.x * hidden_size + idx] =
-        ((scalar_t)(x * s_variance)) * weight[idx];
+  scalar_t* out_row = out + blockIdx.x * hidden_size;
+  auto* v_in = reinterpret_cast<const vec_n_t<scalar_t, VEC_SIZE>*>(input_row);
+  auto* v_w = reinterpret_cast<const vec_n_t<scalar_t, VEC_SIZE>*>(weight);
+  auto* v_out = reinterpret_cast<vec_n_t<scalar_t, VEC_SIZE>*>(out_row);
+  for (int i = threadIdx.x; i < hidden_size / VEC_SIZE; i += blockDim.x) {
+    vec_n_t<scalar_t, VEC_SIZE> dst;
+    vec_n_t<scalar_t, VEC_SIZE> src1 = v_in[i];
+    vec_n_t<scalar_t, VEC_SIZE> src2 = v_w[i];
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; j++) {
+      float x = static_cast<float>(src1.val[j]);
+      dst.val[j] = ((scalar_t)(x * s_variance)) * src2.val[j];
+    }
+    v_out[i] = dst;
  }
 }

@@ -155,30 +186,44 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
              torch::Tensor& weight,  // [hidden_size]
              double epsilon) {
  TORCH_CHECK(out.is_contiguous());
+  if (input.stride(-1) != 1) {
+    input = input.contiguous();
+  }
  TORCH_CHECK(input.stride(-1) == 1);
  TORCH_CHECK(weight.is_contiguous());

  int hidden_size = input.size(-1);

-  // We cannot just use `input.stride(-2)` if the tensor is not row-major.
-  // Instead, we use a 2d view to get the second-innermost stride.
-  // That way the dimensions (except the last one) can be arbitrarily permuted.
-  torch::Tensor input_view = input.view({-1, hidden_size});
-
-  int num_tokens = input_view.numel() / hidden_size;
-  int64_t input_stride = input_view.stride(-2);
+  int num_tokens = input.numel() / hidden_size;
+  int num_dims = input.dim();
+  int64_t input_stride_d2 = input.stride(-2);
+  int64_t input_stride_d3 = (num_dims >= 3) ? input.stride(-3) : 0;
+  int64_t input_stride_d4 = (num_dims >= 4) ? input.stride(-4) : 0;
+  int64_t input_shape_d2 = (num_dims >= 3) ? input.size(-2) : 0;
+  int64_t input_shape_d3 = (num_dims >= 4) ? input.size(-3) : 0;

+  // For large num_tokens, use smaller blocks to increase SM concurrency.
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
  dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(
-      input_view.scalar_type(), "rms_norm_kernel", [&] {
-        vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
-            input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
-            hidden_size);
+  VLLM_DISPATCH_RANK234(num_dims, [&] {
+    VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
+      const int calculated_vec_size =
+          std::gcd(16 / sizeof(scalar_t), hidden_size);
+      const int block_size =
+          std::min(hidden_size / calculated_vec_size, max_block_size);
+      dim3 block(block_size);
+      VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] {
+        vllm::rms_norm_kernel<scalar_t, vec_size, tensor_rank>
+            <<<grid, block, 0, stream>>>(
+                out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+                input_stride_d2, input_stride_d3, input_stride_d4,
+                input_shape_d2, input_shape_d3, weight.data_ptr<scalar_t>(),
+                epsilon, num_tokens, hidden_size);
      });
+    });
+  });
 }

 #define LAUNCH_FUSED_ADD_RMS_NORM(width)                                    \
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -18,7 +18,7 @@
 namespace vllm {

 // TODO(woosuk): Further optimize this kernel.
-template <typename scalar_t, typename fp8_type>
+template <typename scalar_t, typename fp8_type, int VEC_SIZE>
 __global__ void rms_norm_static_fp8_quant_kernel(
    fp8_type* __restrict__ out,          // [..., hidden_size]
    const scalar_t* __restrict__ input,  // [..., hidden_size]
@@ -31,7 +31,6 @@ __global__ void rms_norm_static_fp8_quant_kernel(

  const scalar_t* input_row = input + blockIdx.x * input_stride;

-  constexpr int VEC_SIZE = 8;
  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
 #pragma unroll
    for (int i = 0; i < VEC_SIZE; ++i) {
@@ -58,11 +57,18 @@ __global__ void rms_norm_static_fp8_quant_kernel(
  // invert scale to avoid division
  float const scale_inv = 1.0f / *scale;

-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * input_stride + idx];
-    float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx];
-    out[blockIdx.x * hidden_size + idx] =
-        scaled_fp8_conversion<true, fp8_type>(out_norm, scale_inv);
+  auto* v_in = reinterpret_cast<const vec_n_t<scalar_t, VEC_SIZE>*>(input_row);
+  auto* v_w = reinterpret_cast<const vec_n_t<scalar_t, VEC_SIZE>*>(weight);
+  for (int idx = threadIdx.x; idx < hidden_size / VEC_SIZE; idx += blockDim.x) {
+    vec_n_t<scalar_t, VEC_SIZE> src1 = v_in[idx];
+    vec_n_t<scalar_t, VEC_SIZE> src2 = v_w[idx];
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; j++) {
+      float x = static_cast<float>(src1.val[j]);
+      float const out_norm = ((scalar_t)(x * s_variance)) * src2.val[j];
+      out[blockIdx.x * hidden_size + idx * VEC_SIZE + j] =
+          scaled_fp8_conversion<true, fp8_type>(out_norm, scale_inv);
+    }
  }
 }

@@ -188,20 +194,29 @@ void rms_norm_static_fp8_quant(torch::Tensor& out,     // [..., hidden_size]
  int input_stride = input.stride(-2);
  int num_tokens = input.numel() / hidden_size;

+  // For large num_tokens, use smaller blocks to increase SM concurrency.
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
  dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "rms_norm_kernel_scalar_type", [&] {
        VLLM_DISPATCH_FP8_TYPES(
            out.scalar_type(), "rms_norm_kernel_fp8_type", [&] {
-              vllm::rms_norm_static_fp8_quant_kernel<scalar_t, fp8_t>
-                  <<<grid, block, 0, stream>>>(
-                      out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
-                      input_stride, weight.data_ptr<scalar_t>(),
-                      scale.data_ptr<float>(), epsilon, num_tokens,
-                      hidden_size);
+              const int calculated_vec_size =
+                  std::gcd(16 / sizeof(scalar_t), hidden_size);
+              const int block_size =
+                  std::min(hidden_size / calculated_vec_size, max_block_size);
+              dim3 block(block_size);
+              VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] {
+                vllm::rms_norm_static_fp8_quant_kernel<scalar_t, fp8_t,
+                                                       vec_size>
+                    <<<grid, block, 0, stream>>>(
+                        out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
+                        input_stride, weight.data_ptr<scalar_t>(),
+                        scale.data_ptr<float>(), epsilon, num_tokens,
+                        hidden_size);
+              });
            });
      });
 }
--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@@ -93,16 +93,16 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
  }
  auto Y_all = at::empty({offsets[E], H}, x_c.options());

-  at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
+  at::parallel_for(0, offsets[E], 0, [&](int64_t idx_begin, int64_t idx_end) {
    c10::InferenceMode guard;
-    for (int64_t e = e_begin; e < e_end; ++e) {
-      const int64_t te = counts[e];
-      if (te == 0) {
+    for (int64_t e = 0; e < E; ++e) {
+      int64_t start = std::max(offsets[e], idx_begin);
+      int64_t end = std::min(offsets[e + 1], idx_end);
+      int64_t te = end - start;
+      if (te <= 0) {
        continue;
      }

-      const int64_t start = offsets[e];
-
      auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);

      auto w13_e = w13_packed.select(/*dim=*/0, e);
--- a/csrc/moe/marlin_moe_wna16/.gitignore
+++ b/csrc/moe/marlin_moe_wna16/.gitignore
@@ -1 +1,2 @@
-kernel_*.cu
+sm*_kernel_*.cu
+kernel_selector.h
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -4,134 +4,282 @@ import glob
 import itertools
 import os
 import subprocess
+import sys

 import jinja2

-FILE_HEAD = """
-// auto generated by generate.py
-// clang-format off
+ARCHS = []
+SUPPORT_FP8 = False
+for arch in sys.argv[1].split(","):
+    arch = arch[: arch.index(".") + 2].replace(".", "")
+    arch = int(arch)
+    # only SM89 and SM120 fully support
+    # mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32.
+    # SM90 and SM100 can use this PTX, but it’s simulated
+    # with FP16 MMA, so it cannot achieve any acceleration.
+    if arch in [89, 120]:
+        SUPPORT_FP8 = True

+FILE_HEAD_COMMENT = """
+// auto generated by generate_kernels.py
+// clang-format off
+""".lstrip()
+
+FILE_HEAD = (
+    FILE_HEAD_COMMENT
+    + """
 #include "kernel.h"
 #include "marlin_template.h"

 namespace MARLIN_NAMESPACE_NAME {
-""".strip()
+"""
+)

 TEMPLATE = (
    "template __global__ void Marlin<"
-    "{{scalar_t}}, "
-    "{{w_type_id}}, "
+    "{{a_type_id}}, "
+    "{{b_type_id}}, "
+    "{{c_type_id}}, "
    "{{s_type_id}}, "
    "{{threads}}, "
    "{{thread_m_blocks}}, "
    "{{thread_n_blocks}}, "
    "{{thread_k_blocks}}, "
-    "{{'true' if m_block_size_8 else 'false'}}, "
+    "{{m_block_size_8}}, "
    "{{stages}}, "
    "{{group_blocks}}, "
-    "{{'true' if is_zp_float else 'false'}}>"
+    "{{is_zp_float}}>"
    "( MARLIN_KERNEL_PARAMS );"
 )

-# int8 with zero point case (vllm::kU8) is also supported,
-# we don't add it to reduce wheel size.
-SCALAR_TYPES = [
-    "vllm::kU4",
-    "vllm::kU4B8",
-    "vllm::kU8B128",
-    "vllm::kFE4M3fn",
-    "vllm::kFE2M1f",
-]
 THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]

 THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
-# group_blocks:
-#   = 0 : act order case
-#   = -1 : channelwise quantization
-#   > 0 : group_size=16*group_blocks
-GROUP_BLOCKS = [0, -1, 1, 2, 4, 8]
-DTYPES = ["fp16", "bf16"]
+
+QUANT_CONFIGS = [
+    # AWQ-INT4
+    {
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # GPTQ-INT4
+    {
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 0, 2, 4, 8],
+    },
+    # AWQ-INT8
+    {
+        "b_type": "kU8B128",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 0, 2, 4, 8],
+    },
+    # FP8
+    {
+        "b_type": "kFE4M3fn",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [-1, 8],
+    },
+    # NVFP4
+    {
+        "b_type": "kFE2M1f",
+        "s_type": "kFE4M3fn",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [1],
+    },
+    # MXFP4
+    {
+        "a_type": ["kBFloat16"],
+        "b_type": "kFE2M1f",
+        "s_type": "kFE8M0fnu",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": THREAD_M_BLOCKS,
+        "group_blocks": [2],
+    },
+    # AWQ-INT4 with INT8 activation
+    {
+        "a_type": ["kS8"],
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # GPTQ-INT4 with INT8 activation
+    {
+        "a_type": ["kS8"],
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # GPTQ-INT4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kU4B8",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # AWQ-INT4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kU4",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [-1, 2, 4, 8],
+    },
+    # MXFP4 with FP8 activation
+    {
+        "a_type": ["kFE4M3fn"],
+        "b_type": "kFE2M1f",
+        "c_type": ["kBFloat16"],
+        "s_type": "kFE8M0fnu",
+        "thread_configs": THREAD_CONFIGS,
+        "thread_m_blocks": [1, 2, 3, 4],
+        "group_blocks": [2],
+    },
+]


 def remove_old_kernels():
-    for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"):
+    for filename in glob.glob(os.path.dirname(__file__) + "/*kernel_*.cu"):
        subprocess.call(["rm", "-f", filename])

+    filename = os.path.dirname(__file__) + "/kernel_selector.h"
+    subprocess.call(["rm", "-f", filename])
+

 def generate_new_kernels():
-    for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
+    result_dict = {}
+
+    for quant_config in QUANT_CONFIGS:
+        c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
+        a_types = quant_config.get("a_type", ["kFloat16", "kBFloat16"])
+        b_type = quant_config["b_type"]
+        all_group_blocks = quant_config["group_blocks"]
+        all_m_blocks = quant_config["thread_m_blocks"]
+        all_thread_configs = quant_config["thread_configs"]
+
+        for a_type, c_type in itertools.product(a_types, c_types):
+            if not SUPPORT_FP8 and a_type == "kFE4M3fn":
+                continue
+            if "16" in a_type and "16" in c_type and a_type != c_type:
+                continue
+            s_type = quant_config.get("s_type", c_type)
+            if (a_type, b_type, c_type) not in result_dict:
+                result_dict[(a_type, b_type, c_type)] = []
+
+            for group_blocks, m_blocks, thread_configs in itertools.product(
+                all_group_blocks, all_m_blocks, all_thread_configs
+            ):
+                thread_k, thread_n, threads = thread_configs
+
+                if threads == 256:
+                    # for small batch (m_blocks == 1),
+                    #     we only need (128, 128, 256)
+                    # for large batch (m_blocks > 1),
+                    #     we only need (64, 256, 256)
+                    if m_blocks <= 1 and (thread_k, thread_n) != (128, 128):
+                        continue
+                    if m_blocks > 1 and (thread_k, thread_n) != (64, 256):
+                        continue
+
+                config = {
+                    "threads": threads,
+                    "s_type": s_type,
+                    "thread_m_blocks": max(m_blocks, 1),
+                    "thread_k_blocks": thread_k // 16,
+                    "thread_n_blocks": thread_n // 16,
+                    "m_block_size_8": "true" if m_blocks == 0.5 else "false",
+                    "stages": "pipe_stages",
+                    "group_blocks": group_blocks,
+                    "is_zp_float": "false",
+                }
+
+                result_dict[(a_type, b_type, c_type)].append(config)
+
+    kernel_selector_str = FILE_HEAD_COMMENT
+
+    for (a_type, b_type, c_type), config_list in result_dict.items():
        all_template_str_list = []
-
-        for group_blocks, m_blocks, thread_configs in itertools.product(
-            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
-        ):
-            # act order case only support gptq-int4 and gptq-int8
-            if group_blocks == 0 and scalar_type not in [
-                "vllm::kU4B8",
-                "vllm::kU8B128",
-            ]:
-                continue
-            if thread_configs[2] == 256:
-                # for small batch (m_blocks == 1), we only need (128, 128, 256)
-                # for large batch (m_blocks > 1), we only need (64, 256, 256)
-                if m_blocks <= 1 and thread_configs[0] != 128:
-                    continue
-                if m_blocks > 1 and thread_configs[0] != 64:
-                    continue
-
-            # we only support channelwise quantization and group_size == 128
-            # for fp8
-            if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
-                continue
-            # nvfp4 only supports group_size == 16
-            # mxfp4 only supports group_size == 32
-            if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
-                continue
-            # other quantization methods don't support group_size = 16
-            if scalar_type != "vllm::kFE2M1f" and group_blocks == 1:
-                continue
-
-            k_blocks = thread_configs[0] // 16
-            n_blocks = thread_configs[1] // 16
-            threads = thread_configs[2]
-
-            c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
-
-            if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
-                s_type = "vllm::kFE4M3fn"
-            elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
-                s_type = "vllm::kFE8M0fnu"
-                if dtype == "fp16":
-                    # we cannot safely dequantize e8m0 to fp16, so skip this
-                    continue
-            elif dtype == "fp16":
-                s_type = "vllm::kFloat16"
-            elif dtype == "bf16":
-                s_type = "vllm::kBFloat16"
-
+        for config in config_list:
+            s_type = config["s_type"]
            template_str = jinja2.Template(TEMPLATE).render(
-                scalar_t=c_dtype,
-                w_type_id=scalar_type + ".id()",
-                s_type_id=s_type + ".id()",
-                threads=threads,
-                thread_m_blocks=max(m_blocks, 1),
-                thread_n_blocks=n_blocks,
-                thread_k_blocks=k_blocks,
-                m_block_size_8=m_blocks == 0.5,
-                stages="pipe_stages",
-                group_blocks=group_blocks,
-                is_zp_float=False,
+                a_type_id=f"vllm::{a_type}.id()",
+                b_type_id=f"vllm::{b_type}.id()",
+                c_type_id=f"vllm::{c_type}.id()",
+                s_type_id=f"vllm::{s_type}.id()",
+                **config,
+            )
+            all_template_str_list.append(template_str)
+
+            conditions = [
+                f"a_type == vllm::{a_type}",
+                f"b_type == vllm::{b_type}",
+                f"c_type == vllm::{c_type}",
+                f"s_type == vllm::{s_type}",
+                f"threads == {config['threads']}",
+                f"thread_m_blocks == {config['thread_m_blocks']}",
+                f"thread_n_blocks == {config['thread_n_blocks']}",
+                f"thread_k_blocks == {config['thread_k_blocks']}",
+                f"m_block_size_8 == {config['m_block_size_8']}",
+                f"group_blocks == {config['group_blocks']}",
+                f"is_zp_float == {config['is_zp_float']}",
+            ]
+            conditions = " && ".join(conditions)
+
+            if kernel_selector_str == FILE_HEAD_COMMENT:
+                kernel_selector_str += f"if ({conditions})\n  kernel = "
+            else:
+                kernel_selector_str += f"else if ({conditions})\n  kernel = "
+
+            kernel_template2 = (
+                "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
+                "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
+                "{{thread_n_blocks}}, {{thread_k_blocks}}, "
+                "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
+                "{{is_zp_float}}>;"
            )

-            all_template_str_list.append(template_str)
+            kernel_selector_str += (
+                jinja2.Template(kernel_template2).render(
+                    a_type_id=f"vllm::{a_type}.id()",
+                    b_type_id=f"vllm::{b_type}.id()",
+                    c_type_id=f"vllm::{c_type}.id()",
+                    s_type_id=f"vllm::{s_type}.id()",
+                    **config,
+                )
+                + "\n"
+            )

        file_content = FILE_HEAD + "\n\n"
        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
-        filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu"
+        if a_type == "kFE4M3fn":
+            filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+        else:
+            filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+
+        filename = filename.lower()

        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
            f.write(file_content)

+    if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
+        kernel_selector_str += (
+            "else if (a_type == vllm::kFE4M3fn)\n"
+            "  TORCH_CHECK(false, "
+            '"marlin kernel with fp8 activation is not built.");'
+        )
+
+    with open(os.path.join(os.path.dirname(__file__), "kernel_selector.h"), "w") as f:
+        f.write(kernel_selector_str)
+

 if __name__ == "__main__":
    remove_old_kernels()
--- a/csrc/moe/marlin_moe_wna16/kernel.h
+++ b/csrc/moe/marlin_moe_wna16/kernel.h
@@ -11,8 +11,9 @@
  const int4 *__restrict__ A, const int4 *__restrict__ B,                     \
      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                         \
      const int4 *__restrict__ b_bias_ptr,                                    \
+      const float *__restrict__ a_scales_ptr,                                 \
      const int4 *__restrict__ scales_ptr,                                    \
-      const uint16_t *__restrict__ scale2_ptr,                                \
+      const uint16_t *__restrict__ global_scale_ptr,                          \
      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,         \
      const int32_t *__restrict__ sorted_token_ids_ptr,                       \
      const int32_t *__restrict__ expert_ids_ptr,                             \
@@ -20,12 +21,13 @@
      const float *__restrict__ topk_weights_ptr, int top_k,                  \
      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,          \
      int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
-      bool use_fp32_reduce, int max_shared_mem
+      bool use_fp32_reduce

 namespace MARLIN_NAMESPACE_NAME {
-template <typename scalar_t,  // compute dtype, half or nv_float16
-          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
+template <const vllm::ScalarTypeId a_type_id,  // A ScalarType id
+          const vllm::ScalarTypeId b_type_id,  // B ScalarType id
+          const vllm::ScalarTypeId c_type_id,  // C ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // B_SCALE ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
--- a/csrc/moe/marlin_moe_wna16/marlin_template.h
+++ b/csrc/moe/marlin_moe_wna16/marlin_template.h
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@@ -37,39 +37,6 @@ __global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};

 using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-
-template <int moe_block_size>
-__global__ void permute_cols_kernel(
-    int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr,
-    int4* __restrict__ out_int4_ptr,
-    const int32_t* __restrict__ sorted_token_ids_ptr,
-    const int32_t* __restrict__ expert_ids_ptr,
-    const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m,
-    int size_k, int top_k) {};
-
-}  // namespace marlin
-
-torch::Tensor moe_wna16_marlin_gemm(
-    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
-    torch::Tensor& b_q_weight,
-    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
-    std::optional<torch::Tensor> const& b_zeros_or_none,
-    std::optional<torch::Tensor> const& g_idx_or_none,
-    std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
-    torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
-    torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
-    int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
-    vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n,
-    int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
-    bool is_zp_float) {
-  TORCH_CHECK_NOT_IMPLEMENTED(false,
-                              "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
-  return torch::empty({1, 1});
-}
-
-#else
-
 // For a given "a" of size [M,K] performs a permutation of the K columns based
 // on the given "perm" indices.
 template <int moe_block_size>
@@ -207,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
                          int thread_m_blocks, int prob_m, int prob_n,
                          int prob_k, int num_bits, int group_size,
                          bool has_act_order, bool is_k_full, int has_zp,
-                          int is_zp_float) {
+                          int is_zp_float, bool is_a_8bit) {
  int pack_factor = 32 / num_bits;

  // Get B size
@@ -217,8 +184,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,

  // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
  // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
-  int sh_block_meta_size = tb_m * 4;
-  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
+  int sh_block_meta_size = tb_m * 16;
+  int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
  int sh_red_size = tb_m * (tb_n + 8) * 2;
  int sh_bias_size = tb_n * 2;
@@ -250,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
                     int thread_m_blocks, int prob_m, int prob_n, int prob_k,
                     int num_bits, int group_size, bool has_act_order,
                     bool is_k_full, int has_zp, int is_zp_float,
-                     int max_shared_mem) {
+                     int max_shared_mem, bool is_a_8bit) {
  // Sanity
  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
      th_config.num_threads == -1) {
@@ -273,188 +240,34 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
  }

  // Check that pipeline fits into cache
-  int cache_size = get_kernel_cache_size(
-      th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
-      num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float);
-  return cache_size + 512 <= max_shared_mem;
+  int cache_size =
+      get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
+                            prob_n, prob_k, num_bits, group_size, has_act_order,
+                            is_k_full, has_zp, is_zp_float, is_a_8bit);
+  return cache_size <= max_shared_mem;
 }

-  #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,   \
-                  M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT)      \
-    else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&         \
-             thread_n_blocks == THREAD_N_BLOCKS &&                             \
-             thread_k_blocks == THREAD_K_BLOCKS &&                             \
-             m_block_size_8 == M_BLOCK_SIZE_8 &&                               \
-             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS &&     \
-             is_zp_float == IS_ZP_FLOAT) {                                     \
-      constexpr auto S_TYPE =                                                  \
-          W_TYPE == vllm::kFE2M1f                                              \
-              ? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu)         \
-              : (std::is_same<scalar_t, half>::value ? vllm::kFloat16          \
-                                                     : vllm::kBFloat16);       \
-      kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS,         \
-                      THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,       \
-                      M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
-    }
-
-  // COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
-  //         this is the most common cases
-  // BIGGROUP: cases for big group size (group_blocks in [-1, 8])
-  // FZP: cases for float-zero-point (is_zp_float = true)
-  // ACT: cases for act order case (group_blocks == 0)
-  // FP4: cases for nvfp4(e2m1) (group_blocks == 1)
-  #define COMMON_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false)   \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, false)   \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false)   \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
-
-  #define COMMON_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
-                                                                          \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
-                                                                          \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
-
-  #define COMMON_GET_IF(W_TYPE)            \
-    COMMON_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    COMMON_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    COMMON_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    COMMON_GET_IF_M234(W_TYPE, 8, 4, 128)
-
-  #define BIGGROUP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false)   \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
-
-  #define BIGGROUP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)   \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
-
-  #define BIGGROUP_GET_IF(W_TYPE)            \
-    BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128)
-
-  #define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
-
-  #define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
-
-  #define NVFP4_GET_IF(W_TYPE)            \
-    NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
-
-  #define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
-
-  #define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
-
-  #define MXFP4_GET_IF(W_TYPE)            \
-    MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
-
-  // We currently have 4-bit models only with group_blocks == 4
-  #define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
-
-  #define FZP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
-
-  #define FZP_GET_IF(W_TYPE)            \
-    FZP_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    FZP_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    FZP_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    FZP_GET_IF_M234(W_TYPE, 8, 4, 128)
-
-  // We currently have 4-bit models only with group_blocks == 4
-  #define ACT_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)        \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
-
-  #define ACT_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
-
-  #define ACT_GET_IF(W_TYPE)            \
-    ACT_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    ACT_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    ACT_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    ACT_GET_IF_M234(W_TYPE, 8, 4, 128)
-
-template <typename scalar_t>
-MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
-                                int thread_m_blocks, int thread_n_blocks,
-                                int thread_k_blocks, bool m_block_size_8,
-                                bool has_act_order, bool has_zp,
-                                int group_blocks, int num_threads,
-                                bool is_zp_float) {
-  int num_bits = q_type.size_bits();
+MarlinFuncPtr get_marlin_kernel(
+    const vllm::ScalarType a_type, const vllm::ScalarType b_type,
+    const vllm::ScalarType c_type, const vllm::ScalarType s_type,
+    int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
+    bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
+    int threads, bool is_zp_float) {
+  int num_bits = b_type.size_bits();
  auto kernel = MarlinDefault;
-  if (false) {
-  }

-  COMMON_GET_IF(vllm::kU4)
-  COMMON_GET_IF(vllm::kU4B8)
-  COMMON_GET_IF(vllm::kU8B128)
-
-  NVFP4_GET_IF(vllm::kFE2M1f)
-
-  BIGGROUP_GET_IF(vllm::kFE4M3fn)
-
-  ACT_GET_IF(vllm::kU4B8)
-  ACT_GET_IF(vllm::kU8B128)
-  if (std::is_same<scalar_t, nv_bfloat16>::value) {
-    if (false) {
-    }
-    MXFP4_GET_IF(vllm::kFE2M1f)
-  }
+#include "kernel_selector.h"

  return kernel;
 }

-template <typename scalar_t>
-exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
-                                    int prob_n, int prob_k, int thread_m_blocks,
-                                    bool m_block_size_8, int num_bits,
-                                    int group_size, bool has_act_order,
-                                    bool is_k_full, bool has_zp,
-                                    bool is_zp_float, int max_shared_mem) {
+exec_config_t determine_exec_config(
+    const vllm::ScalarType& a_type, const vllm::ScalarType& b_type,
+    const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
+    int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
+    bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
+    bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
+    bool is_a_8bit) {
  exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
  thread_config_t* thread_configs = thread_m_blocks > 1
                                        ? large_batch_thread_configs
@@ -471,73 +284,69 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,

    if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
                         prob_n, prob_k, num_bits, group_size, has_act_order,
-                         is_k_full, has_zp, is_zp_float, max_shared_mem)) {
+                         is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
+                         is_a_8bit)) {
      continue;
    }

    int cache_size = get_kernel_cache_size(
        th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
-        num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float);
+        num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
+        is_a_8bit);

    int group_blocks = 0;
    if (!has_act_order) {
      group_blocks = group_size == -1 ? -1 : (group_size / 16);
    }

-    auto kernel = get_marlin_kernel<scalar_t>(
-        q_type, thread_m_blocks, th_config.thread_n / 16,
-        th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp,
-        group_blocks, th_config.num_threads, is_zp_float);
+    auto kernel =
+        get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
+                          th_config.thread_n / 16, th_config.thread_k / 16,
+                          m_block_size_8, has_act_order, has_zp, group_blocks,
+                          th_config.num_threads, is_zp_float);

    if (kernel == MarlinDefault) continue;

-    if (thread_m_blocks > 1) {
-      exec_cfg = {1, th_config};
-      break;
-    } else {
-      cudaFuncAttributes attr;
-      cudaFuncGetAttributes(&attr, kernel);
-      int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
-      int allow_count = min(device_max_reg_size / reg_size,
-                            max_shared_mem / (cache_size + 1024));
+    cudaFuncAttributes attr;
+    cudaFuncGetAttributes(&attr, kernel);
+    int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
+    int allow_count = min(device_max_reg_size / reg_size,
+                          max_shared_mem / (cache_size + 1536));
+    if (thread_m_blocks == 1)
      allow_count = max(min(allow_count, 4), 1);
-      if (allow_count > count) {
-        count = allow_count;
-        exec_cfg = {count, th_config};
-      };
+    else
+      allow_count = max(min(allow_count, 2), 1);
+
+    if (prob_n / th_config.thread_n * prob_m * top_k * 4 < sms * allow_count) {
+      allow_count =
+          max(prob_n / th_config.thread_n * prob_m * top_k * 4 / sms, 1);
    }
+
+    if (allow_count > count) {
+      count = allow_count;
+      exec_cfg = {count, th_config};
+    };
  }

  return exec_cfg;
 }

-template <typename scalar_t>
 void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
-               void* s, void* s2, void* zp, void* g_idx, void* perm,
-               void* a_tmp, void* sorted_token_ids, void* expert_ids,
-               void* num_tokens_past_padded, void* topk_weights,
-               int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep,
-               int prob_m, int prob_n, int prob_k, void* workspace,
-               vllm::ScalarType const& q_type, bool has_bias,
-               bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
-               int group_size, int dev, cudaStream_t stream, int thread_k,
-               int thread_n, int sms, bool use_atomic_add, bool use_fp32_reduce,
-               bool is_zp_float) {
+               void* a_s, void* b_s, void* g_s, void* zp, void* g_idx,
+               void* perm, void* a_tmp, void* sorted_token_ids,
+               void* expert_ids, void* num_tokens_past_padded,
+               void* topk_weights, int moe_block_size, int num_experts,
+               int top_k, bool mul_topk_weights, bool is_ep, int prob_m,
+               int prob_n, int prob_k, void* workspace,
+               vllm::ScalarType const& a_type, vllm::ScalarType const& b_type,
+               vllm::ScalarType const& c_type, vllm::ScalarType const& s_type,
+               bool has_bias, bool has_act_order, bool is_k_full, bool has_zp,
+               int num_groups, int group_size, int dev, cudaStream_t stream,
+               int thread_k, int thread_n, int sms, int blocks_per_sm,
+               bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
  int thread_m_blocks = div_ceil(moe_block_size, 16);
  bool m_block_size_8 = moe_block_size == 8;
-
-  if (has_zp) {
-    TORCH_CHECK(
-        q_type == vllm::kU4 || q_type == vllm::kU8,
-        "q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str());
-  } else {
-    TORCH_CHECK(
-        q_type == vllm::kU4B8 || q_type == vllm::kU8B128 ||
-            q_type == vllm::kFE4M3fn || q_type == vllm::kFE2M1f,
-        "q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when "
-        "has_zp = False. Got = ",
-        q_type.str());
-  }
+  bool is_a_8bit = a_type.size_bits() == 8;

  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
              ", ", prob_n, ", ", prob_k, "]");
@@ -563,14 +372,15 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
    }
  }

-  int num_bits = q_type.size_bits();
+  int num_bits = b_type.size_bits();
  const int4* A_ptr = (const int4*)A;
  const int4* B_ptr = (const int4*)B;
  int4* C_ptr = (int4*)C;
  int4* C_tmp_ptr = (int4*)C_tmp;
  const int4* bias_ptr = (const int4*)b_bias;
-  const int4* s_ptr = (const int4*)s;
-  const uint16_t* s2_ptr = (const uint16_t*)s2;
+  const float* a_s_ptr = (const float*)a_s;
+  const int4* b_s_ptr = (const int4*)b_s;
+  const uint16_t* g_s_ptr = (const uint16_t*)g_s;
  const int4* zp_ptr = (const int4*)zp;
  const int* g_idx_ptr = (const int*)g_idx;
  const int* perm_ptr = (const int*)perm;
@@ -618,22 +428,41 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
  TORCH_CHECK(max_shared_mem > 0);

+  int major_capability, minor_capability;
+  cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
+                         dev);
+  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
+                         dev);
+  TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
+              "marlin kernel only support Ampere or newer GPUs.");
+  if (a_type == vllm::kFE4M3fn) {
+    TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
+                "FP8 only support Ada Lovelace or newer GPUs.");
+    TORCH_CHECK(
+        major_capability * 10 + minor_capability == 89 ||
+            major_capability * 10 + minor_capability == 120,
+        "Marlin W4A8-FP8 only support SM89 or SM120 device (It is slower than "
+        "Marlin W4A16 on other devices).");
+  }
+
  // Set thread config
  exec_config_t exec_cfg;
  thread_config_t thread_tfg;
  if (thread_k != -1 && thread_n != -1) {
-    thread_tfg = thread_config_t{thread_k, thread_n, default_threads};
-    exec_cfg = exec_config_t{1, thread_tfg};
+    thread_tfg = thread_config_t{thread_k, thread_n, thread_k * thread_n / 64};
+    if (blocks_per_sm == -1) blocks_per_sm = 1;
+    exec_cfg = exec_config_t{blocks_per_sm, thread_tfg};
    TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
                " is not divisible by thread_n = ", thread_n);
    TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
                " is not divisible by thread_k = ", thread_k);
  } else {
    // Auto config
-    exec_cfg = determine_exec_config<scalar_t>(
-        q_type, prob_m, prob_n, prob_k, thread_m_blocks, m_block_size_8,
-        num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
-        max_shared_mem);
+    exec_cfg = determine_exec_config(
+        a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
+        top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
+        has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
+        is_a_8bit);
    thread_tfg = exec_cfg.tb_cfg;
  }

@@ -647,22 +476,29 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
  int thread_k_blocks = thread_k / 16;
  int thread_n_blocks = thread_n / 16;

-  TORCH_CHECK(
-      is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
-                      prob_n, prob_k, num_bits, group_size, has_act_order,
-                      is_k_full, has_zp, is_zp_float, max_shared_mem),
-      "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
-      ", thread_k = ", thread_tfg.thread_k,
-      ", thread_n = ", thread_tfg.thread_n,
-      ", num_threads = ", thread_tfg.num_threads, " for MKN = [", prob_m, ", ",
-      prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
-      ", group_size = ", group_size, ", has_act_order = ", has_act_order,
-      ", is_k_full = ", is_k_full, ", has_zp = ", has_zp,
-      ", is_zp_float = ", is_zp_float, ", max_shared_mem = ", max_shared_mem);
+  TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
+                              prob_m, prob_n, prob_k, num_bits, group_size,
+                              has_act_order, is_k_full, has_zp, is_zp_float,
+                              max_shared_mem, is_a_8bit),
+              "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
+              ", thread_k = ", thread_tfg.thread_k,
+              ", thread_n = ", thread_tfg.thread_n,
+              ", num_threads = ", thread_tfg.num_threads, " for MKN = [",
+              prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
+              ", group_size = ", group_size,
+              ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
+              ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
+              ", max_shared_mem = ", max_shared_mem);

-  auto kernel = get_marlin_kernel<scalar_t>(
-      q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8,
-      has_act_order, has_zp, group_blocks, num_threads, is_zp_float);
+  int sh_cache_size =
+      get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
+                            prob_n, prob_k, num_bits, group_size, has_act_order,
+                            is_k_full, has_zp, is_zp_float, is_a_8bit);
+
+  auto kernel = get_marlin_kernel(
+      a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
+      thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
+      num_threads, is_zp_float);

  if (kernel == MarlinDefault) {
    TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
@@ -679,19 +515,20 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
  // avoid ">>>" being formatted to "> > >"
  // clang-format off
  kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
-      A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr,
+      A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr, g_idx_ptr,
      sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
      topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
-      prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce, max_shared_mem);
+      prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce);
  // clang-format on
 }

 }  // namespace MARLIN_NAMESPACE_NAME

 torch::Tensor moe_wna16_marlin_gemm(
-    torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
+    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
    torch::Tensor& b_q_weight,
    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& a_scales_or_none,
    std::optional<torch::Tensor> const& global_scale_or_none,
    std::optional<torch::Tensor> const& b_zeros_or_none,
    std::optional<torch::Tensor> const& g_idx_or_none,
@@ -699,11 +536,70 @@ torch::Tensor moe_wna16_marlin_gemm(
    torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
    torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
    int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
-    vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n,
+    vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
    int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
-    bool is_zp_float) {
-  vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
-  int pack_factor = 32 / b_q_type.size_bits();
+    bool is_zp_float, int64_t thread_k, int64_t thread_n,
+    int64_t blocks_per_sm) {
+  vllm::ScalarTypeId a_type_id, c_type_id, s_type_id;
+
+  auto c_dtype = a.dtype();
+  if (a.scalar_type() == at::ScalarType::Half) {
+    a_type_id = vllm::kFloat16.id();
+    c_type_id = vllm::kFloat16.id();
+  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
+    a_type_id = vllm::kBFloat16.id();
+    c_type_id = vllm::kBFloat16.id();
+  } else {
+    c_dtype = b_scales.dtype();
+    if (b_scales.scalar_type() == at::ScalarType::Half) {
+      c_type_id = vllm::kFloat16.id();
+    } else if (b_scales.scalar_type() == at::ScalarType::BFloat16) {
+      c_type_id = vllm::kBFloat16.id();
+    } else {
+      c_type_id = vllm::kBFloat16.id();
+
+      TORCH_CHECK(c_or_none.has_value(), "c must be passed for W4A8-FP4");
+      torch::Tensor c = c_or_none.value();
+      c_dtype = c.dtype();
+
+      if (c.scalar_type() == at::ScalarType::Half) {
+        c_type_id = vllm::kFloat16.id();
+      } else if (c.scalar_type() == at::ScalarType::BFloat16) {
+        c_type_id = vllm::kBFloat16.id();
+      } else {
+        TORCH_CHECK(false, "unsupported c dtype");
+      }
+    }
+
+    if (a.scalar_type() == at::ScalarType::Float8_e4m3fn) {
+      a_type_id = vllm::kFE4M3fn.id();
+    } else if (a.scalar_type() == at::ScalarType::Char) {
+      a_type_id = vllm::kS8.id();
+    } else {
+      TORCH_CHECK(false, "unsupported `a` scalar_type");
+    }
+  }
+
+  s_type_id = c_type_id;
+  if (b_type_id == vllm::kFE2M1f.id()) {
+    if (b_scales.scalar_type() == at::ScalarType::Float8_e4m3fn) {
+      s_type_id = vllm::kFE4M3fn.id();
+    } else if (b_scales.scalar_type() == at::ScalarType::Float8_e8m0fnu) {
+      s_type_id = vllm::kFE8M0fnu.id();
+    } else {
+      TORCH_CHECK(false,
+                  "When b_type = float4_e2m1f, b_scale scalar type must be",
+                  "float8_e4m3fn (for NVFP4) or float8_e8m0fnu (for MXFP4).");
+    }
+  }
+
+  vllm::ScalarType a_type = vllm::ScalarType::from_id(a_type_id);
+  vllm::ScalarType b_type = vllm::ScalarType::from_id(b_type_id);
+  vllm::ScalarType c_type = vllm::ScalarType::from_id(c_type_id);
+  vllm::ScalarType s_type = vllm::ScalarType::from_id(s_type_id);
+
+  int pack_factor = 32 / b_type.size_bits();
+  int num_experts = b_q_weight.size(0);

  if (moe_block_size != 8) {
    TORCH_CHECK(moe_block_size % 16 == 0,
@@ -745,19 +641,27 @@ torch::Tensor moe_wna16_marlin_gemm(
  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");

-  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
-  // auto -1)
-  int thread_k = -1;
-  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
-  // auto -1)
-  int thread_n = -1;
+  torch::Tensor a_scales;
+  auto options = torch::TensorOptions().dtype(c_dtype).device(a.device());
+  auto options_fp32 =
+      torch::TensorOptions().dtype(at::kFloat).device(a.device());
+
+  if (a_scales_or_none.has_value()) {
+    a_scales = a_scales_or_none.value();
+    TORCH_CHECK(a_type.size_bits() == 8,
+                "a_scales can only be used for 8bit activation.");
+  } else {
+    a_scales = torch::empty({0}, options_fp32);
+    TORCH_CHECK(a_type.size_bits() != 8,
+                "the a_scales parameter must be passed for 8bit activation.");
+  }
+
  // sms: number of SMs to use for the kernel
  int sms = -1;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());

  // Alloc buffers
  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
-  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
  torch::Tensor c;
  if (c_or_none.has_value()) {
    c = c_or_none.value();
@@ -774,8 +678,6 @@ torch::Tensor moe_wna16_marlin_gemm(

  // Alloc C tmp buffer that is going to be used for the global reduce
  torch::Tensor c_tmp;
-  auto options_fp32 =
-      torch::TensorOptions().dtype(at::kFloat).device(a.device());
  if (use_fp32_reduce && !use_atomic_add) {
    // max num of threadblocks is sms * 4
    long max_c_tmp_size = min(
@@ -846,11 +748,11 @@ torch::Tensor moe_wna16_marlin_gemm(
  torch::Tensor global_scale;
  if (global_scale_or_none.has_value()) {
    global_scale = global_scale_or_none.value();
-    TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16,
+    TORCH_CHECK(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn,
                "global_scale can only be used for nvfp4 format.");
  } else {
    global_scale = torch::empty({0}, options);
-    TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16),
+    TORCH_CHECK(!(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn),
                "the global_scale parameter must be passed for nvfp4 format.");
  }

@@ -877,15 +779,15 @@ torch::Tensor moe_wna16_marlin_gemm(
  bool has_zp = b_zeros.size(-1) > 0;
  if (has_zp) {
    TORCH_CHECK(
-        b_q_type == vllm::kU4 || b_q_type == vllm::kU8,
-        "b_q_type must be u4 or u8 when has_zp = True. Got = ", b_q_type.str());
+        b_type == vllm::kU4 || b_type == vllm::kU8,
+        "b_type must be u4 or u8 when has_zp = True. Got = ", b_type.str());
  } else {
-    TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128 ||
-                    b_q_type == vllm::kFE4M3fn || b_q_type == vllm::kFE2M1f,
-                "b_q_type must be uint4b8, uint8b128, float8_e4m3fn or "
-                "float4_e2m1f when "
-                "has_zp = False. Got = ",
-                b_q_type.str());
+    TORCH_CHECK(b_type == vllm::kU4B8 || b_type == vllm::kU8B128 ||
+                    b_type == vllm::kS4 || b_type == vllm::kS8 ||
+                    b_type == vllm::kFE4M3fn || b_type == vllm::kFE2M1f,
+                "b_type must be uint4b8, uint8b128, int4, int8, "
+                "float8_e4m3fn or float4_e2m1f when has_zp = False. Got = ",
+                b_type.str());
  }

  if (has_zp && is_zp_float) {
@@ -929,71 +831,33 @@ torch::Tensor moe_wna16_marlin_gemm(
              " is below min_workspace_size = ", min_workspace_size);

  int dev = a.get_device();
-  if (a.scalar_type() == at::ScalarType::Half) {
-    void* scales_ptr;
-    if (b_q_type == vllm::kFE2M1f) {
-      if (group_size == 16)
-        scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
-      else if (group_size == 32)
-        scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
-      else
-        TORCH_CHECK(false,
-                    "float4_e2m1f only supports group_size == 16 (NVFP4) ",
-                    "and group_size == 32 (MXFP4)");
-    } else {
-      scales_ptr = b_scales.data_ptr<at::Half>();
-    }

-    MARLIN_NAMESPACE_NAME::marlin_mm<half>(
-        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
-        c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr,
-        global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(),
-        perm.data_ptr(), a_tmp.data_ptr<at::Half>(),
-        sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
-        num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
-        moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
-        workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
-        has_zp, num_groups, group_size, dev,
-        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
-        use_atomic_add, use_fp32_reduce, is_zp_float);
-  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
-    void* scales_ptr;
-    if (b_q_type == vllm::kFE2M1f) {
-      if (group_size == 16)
-        scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
-      else if (group_size == 32)
-        scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
-      else
-        TORCH_CHECK(false,
-                    "float4_e2m1f only supports group_size == 16 (NVFP4) ",
-                    "and group_size == 32 (MXFP4)");
-    } else {
-      scales_ptr = b_scales.data_ptr<at::BFloat16>();
-    }
-
-    MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
-        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
-        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
-        b_bias.data_ptr<at::BFloat16>(), scales_ptr,
-        global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
-        g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
-        sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
-        num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
-        moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
-        workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
-        has_zp, num_groups, group_size, dev,
-        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
-        use_atomic_add, use_fp32_reduce, is_zp_float);
-  } else {
-    TORCH_CHECK(false,
-                "moe_wna16_marlin_gemm only supports bfloat16 and float16");
+  TORCH_CHECK(a_scales.scalar_type() == at::ScalarType::Float,
+              "scalar type of a_scales must be float");
+  TORCH_CHECK(global_scale.scalar_type() == c.scalar_type(),
+              "scalar type of global_scale must be the same with c");
+  if (a_type.size_bits() == 16) {
+    TORCH_CHECK(
+        a.scalar_type() == c.scalar_type(),
+        "scalar type of a must be the same with c for 16 bit activation");
  }

+  MARLIN_NAMESPACE_NAME::marlin_mm(
+      a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), c_tmp.data_ptr(),
+      b_bias.data_ptr(), a_scales.data_ptr(), b_scales.data_ptr(),
+      global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(),
+      perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(),
+      expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
+      topk_weights.data_ptr(), moe_block_size, num_experts, top_k,
+      mul_topk_weights, is_ep, size_m, size_n, size_k, workspace.data_ptr(),
+      a_type, b_type, c_type, s_type, has_bias, has_act_order, is_k_full,
+      has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
+      thread_k, thread_n, sms, blocks_per_sm, use_atomic_add, use_fp32_reduce,
+      is_zp_float);
+
  return c;
 }

-#endif
-
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
 }
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -63,16 +63,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  m.def(
      "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
      "Tensor! b_q_weight, Tensor? b_bias_or_none,"
-      "Tensor! b_scales, Tensor? global_scale, Tensor? "
+      "Tensor! b_scales, Tensor? a_scales, Tensor? global_scale, Tensor? "
      "b_zeros_or_none,"
      "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
      "Tensor sorted_token_ids,"
      "Tensor! expert_ids, Tensor! num_tokens_past_padded,"
      "Tensor! topk_weights, int moe_block_size, int top_k, "
-      "bool mul_topk_weights, bool is_ep, int b_q_type_id,"
+      "bool mul_topk_weights, bool is_ep, int b_type_id,"
      "int size_m, int size_n, int size_k,"
      "bool is_full_k, bool use_atomic_add,"
-      "bool use_fp32_reduce, bool is_zp_float) -> Tensor");
+      "bool use_fp32_reduce, bool is_zp_float,"
+      "int thread_k, int thread_n, int blocks_per_sm) -> Tensor");
+
  m.def(
      "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
      "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -52,14 +52,13 @@ void paged_attention_v2(
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

-#ifndef USE_ROCM
 void merge_attn_states(torch::Tensor& output,
                       std::optional<torch::Tensor> output_lse,
                       const torch::Tensor& prefix_output,
                       const torch::Tensor& prefix_lse,
                       const torch::Tensor& suffix_output,
                       const torch::Tensor& suffix_lse);
-
+#ifndef USE_ROCM
 void convert_vertical_slash_indexes(
    torch::Tensor& block_count,      // [BATCH, N_HEADS, NUM_ROWS]
    torch::Tensor& block_offset,     // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
@@ -92,6 +91,12 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

+void fused_qk_norm_rope(torch::Tensor& qkv, int64_t num_heads_q,
+                        int64_t num_heads_k, int64_t num_heads_v,
+                        int64_t head_dim, double eps, torch::Tensor& q_weight,
+                        torch::Tensor& k_weight, torch::Tensor& cos_sin_cache,
+                        bool is_neox, torch::Tensor& position_ids);
+
 void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& prompt_mask,
                                 const torch::Tensor& output_mask,
--- a/Show More
+++ b/Show More