[CI] Reduce wheel size by not shipping debug symbols (#4602 )

bump version to v0.4.2 (#4600 )
[Bugfix] Fix inappropriate content of model_name tag in Prometheus metrics (#3937 )
2024-05-04 21:28:58 -07:00 · 2024-05-04 17:09:49 -07:00 · 2024-05-04 15:39:34 -07:00 · 2024-05-04 20:44:36 +00:00 · 2024-05-04 11:45:16 -07:00 · 2024-05-04 00:18:00 -07:00
514 changed files with 62923 additions and 11178 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -0,0 +1,36 @@
 import os
 import zipfile
 MAX_SIZE_MB = 100
 def print_top_10_largest_files(zip_file):
     """Print the ten largest members of a zip archive, biggest first.

     Each member is reported on its own line together with its uncompressed
     size converted from bytes to megabytes (bytes / 1024 / 1024).
     """
     bytes_per_mb = 1024 * 1024
     with zipfile.ZipFile(zip_file, 'r') as archive:
         sized_members = []
         for member_name in archive.namelist():
             sized_members.append(
                 (member_name, archive.getinfo(member_name).file_size))
     # sorted() is stable, so members of equal size keep their archive order.
     ranked = sorted(sized_members, key=lambda entry: entry[1], reverse=True)
     for member_name, num_bytes in ranked[:10]:
         print(f"{member_name}: {num_bytes/bytes_per_mb} MBs uncompressed.")
 def check_wheel_size(directory):
     """Check every wheel found under *directory* against the size budget.

     Walks the tree rooted at ``directory`` and compares the on-disk size of
     each ``.whl`` file with ``MAX_SIZE_MB``.

     Returns:
         1 as soon as a wheel exceeds the limit (after printing its ten
         largest members), otherwise 0 once the whole tree has been walked.
     """
     for dirpath, _, filenames in os.walk(directory):
         for filename in filenames:
             if not filename.endswith(".whl"):
                 continue

             wheel_path = os.path.join(dirpath, filename)
             wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)

             if wheel_size_mb <= MAX_SIZE_MB:
                 print(f"Wheel {wheel_path} is within the allowed size "
                       f"({wheel_size_mb} MB).")
                 continue

             # Over budget: report it, show what takes the space, and stop.
             print(
                 f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                 f"compare to the allowed size ({MAX_SIZE_MB} MB).")
             print_top_10_largest_files(wheel_path)
             return 1
     return 0
 if __name__ == "__main__":
     import sys

     # The first command-line argument is the directory to scan; the value
     # returned by the check becomes the process exit status.
     exit_status = check_wheel_size(sys.argv[1])
     sys.exit(exit_status)
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@@ -0,0 +1,18 @@
 #!/bin/bash
 # Download the LLaVA test fixtures (tensors and images) into ./images.
 set -ex
 set -o pipefail
 # Install wget and curl only when at least one of them is missing.
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 mkdir -p images
 cd images
 base_url=https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava
 # Fetch the fixtures one by one, in the same order as before.
 for asset in \
     stop_sign_pixel_values.pt \
     stop_sign_image_features.pt \
     cherry_blossom_pixel_values.pt \
     cherry_blossom_image_features.pt \
     stop_sign.jpg \
     cherry_blossom.jpg
 do
     wget "${base_url}/${asset}"
 done
 cd -
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -0,0 +1,44 @@
 # This script builds the ROCm docker image and runs a test command inside it.
 # -e: abort on the first failing command; -x: echo every command as it runs.
 set -ex
 # Print ROCm version
 echo "--- ROCm info"
 rocminfo
 # Request a GPU reset by writing "reset" to the state file, then poll the
 # same file every 3 seconds until it contains "clean".
 # NOTE(review): the loop has no timeout; if the state never becomes "clean"
 # the script waits here indefinitely.
 echo "--- Resetting GPUs"
 echo "reset" > /opt/amdgpu/etc/gpu_state
 while true; do
        sleep 3
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
                echo "GPUs state is \"clean\""
                break
        fi
 done
 # Build the image from Dockerfile.rocm and tag it with the short commit hash,
 # so the image name (reused below as the container name) is per-commit.
 echo "--- Building container"
 sha=$(git rev-parse --short HEAD)
 container_name=rocm_${sha}
 docker build \
        -t ${container_name} \
        -f Dockerfile.rocm \
        --progress plain \
        .
 # Best-effort cleanup: force-remove the container; only if that command
 # fails, fall back to force-removing the image of the same name. The
 # function always returns success.
 remove_docker_container() {
   if ! docker rm -f ${container_name}; then
     docker image rm -f ${container_name} || true
   fi
 }
 # Run the cleanup whenever the script exits, whether it succeeded or failed.
 trap remove_docker_container EXIT
 echo "--- Running container"
 # The pipeline template passes the whole test command as ONE argument that is
 # wrapped in literal single quotes; strip that wrapper before running it.
 test_command=$(printf '%s\n' "$1" | sed "s/^'//" | sed "s/'$//")
 # The command must be quoted so that `bash -c` receives it as a single script
 # string. Left unquoted, word splitting hands only the first word (e.g. "cd")
 # to -c and turns everything else, including "&&", into positional
 # parameters, so the actual tests are never executed.
 docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
        --rm \
        -e HF_TOKEN \
        --name ${container_name} \
        ${container_name} \
        /bin/bash -c "${test_command}"
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -23,8 +23,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 python3 benchmarks/benchmark_serving.py \
-    --backend openai \
+    --backend vllm \
-    --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
+    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --model meta-llama/Llama-2-7b-chat-hf \
    --num-prompts 20 \
    --endpoint /v1/completions \
@@ -48,7 +49,14 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
 echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
-tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
+echo '```' >> benchmark_results.md
 tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
 echo '```' >> benchmark_results.md
 # if the agent binary is not found, skip uploading the results, exit 0
 if [ ! -f /workspace/buildkite-agent ]; then
    exit 0
 fi
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -0,0 +1,14 @@
 # This script builds the CPU docker image and runs the offline inference inside the container.
 # It serves as a sanity check for compilation and basic model usage.
 set -ex
 # Try building the docker image
 docker build -t cpu-test -f Dockerfile.cpu .
 # Setup cleanup: force-remove the "cpu-test" container, ignoring any docker error
 remove_docker_container() { docker rm -f cpu-test || true; }
 # Clean up on every script exit, and once right away (presumably to clear a
 # leftover container from an earlier run so the --name below is free)
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and launch offline inference
 docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -0,0 +1,51 @@
 # This script builds the Neuron docker image and runs the API server inside the container.
 # It serves as a sanity check for compilation and basic model usage.
 set -e
 # Log docker in to the ECR registry before building
 # (presumably it hosts the base image; verify against Dockerfile.neuron)
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
 # Prune old images and containers to save disk space, at most once a day,
 # using a timestamp file in /tmp to remember the last prune.
 if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    # 86400 seconds = 24 hours
    if [ $((current_time - last_build)) -gt 86400 ]; then
        docker system prune -f
        echo $current_time > /tmp/neuron-docker-build-timestamp
    fi
 else
    # First run on this machine: only record the time, do not prune
    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
 fi
 # Try building the docker image
 docker build -t neuron -f Dockerfile.neuron .
 # Setup cleanup: force-remove the "neuron" container, ignoring any docker error
 remove_docker_container() { docker rm -f neuron || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 # Run the image; the trailing & puts the API server in the background so the
 # script can go on to poll and query it
 docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
 # Wait for the server to start.
 # Polls localhost:8000/health once per second until it answers HTTP 200.
 # Returns 0 once the server is healthy, or 1 after 300 unsuccessful attempts
 # (roughly 300 seconds) so that `set -e` stops the script instead of letting
 # it continue against a server that never came up.
 wait_for_server_to_start() {
    # local: keep the counters out of the script's global scope
    local timeout=300
    local counter=0
    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            return 1
        fi
    done
 }
 wait_for_server_to_start
 # Test a simple prompt by POSTing it to the /generate endpoint.
 # NOTE(review): curl is run without --fail, so an HTTP error response still
 # exits 0; this step mainly verifies that the server is reachable.
 curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -12,57 +12,120 @@ steps:
  command: pytest -v -s async_engine
 - label: Basic Correctness Test
-  command: pytest -v -s --forked basic_correctness
+  commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 - label: Core Test
  mirror_hardwares: [amd]
  command: pytest -v -s core
 - label: Distributed Comm Ops Test
-  command: pytest -v -s --forked test_comm_ops.py
+  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
-  num_gpus: 2 # only support 1 or 2 for now.
+  num_gpus: 2
- label: Distributed Correctness Test
+- label: Distributed Tests
  command: pytest -v -s --forked test_basic_distributed_correctness.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s test_pynccl_library.py
  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
 - label: Distributed Tests (Multiple Groups)
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 4
  commands:
  - pytest -v -s test_pynccl.py
 - label: Engine Test
-  command: pytest -v -s engine
+  mirror_hardwares: [amd]
  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 - label: Entrypoints Test
-  command: pytest -v -s entrypoints
+  commands:
  # these tests have to be separated, because each one will allocate all possible GPU memory
  - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
  - pytest -v -s entrypoints/test_server_oot_registration.py
- label: Kernels Test
+- label: Examples Test
-  command: pytest -v -s kernels
+  working_dir: "/vllm-workspace/examples"
-  soft_fail: true
+  mirror_hardwares: [amd]
  commands:
    # install aws cli for llava_example.py
    - pip install awscli
    - python3 offline_inference.py
    - python3 offline_inference_with_prefix.py
    - python3 llm_engine_example.py
    - python3 llava_example.py
 - label: Kernels Test %N
  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4
 - label: Models Test
  mirror_hardwares: [amd]
  commands:
-    - pytest -v -s models --forked
+    - bash ../.buildkite/download-images.sh
-  soft_fail: true
+    - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
 - label: Llava Test
  mirror_hardwares: [amd]
  commands:
    - bash ../.buildkite/download-images.sh
    - pytest -v -s models/test_llava.py
 - label: Prefix Caching Test
  mirror_hardwares: [amd]
  commands:
    - pytest -v -s prefix_caching
 - label: Samplers Test
-  command: pytest -v -s samplers --forked
+  command: pytest -v -s samplers
 - label: LogitsProcessor Test
  mirror_hardwares: [amd]
  command: pytest -v -s test_logits_processor.py
 - label: Worker Test
  mirror_hardwares: [amd]
  command: pytest -v -s worker
- label: LoRA Test
+- label: Speculative decoding tests
-  command: pytest -v -s lora --forked
+  mirror_hardwares: [amd]
  command: pytest -v -s spec_decode
 - label: LoRA Test %N
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4
 - label: Tensorizer Test
  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
 - label: Metrics Test
  command: pytest -v -s metrics
 - label: Quantization Test
  command: pytest -v -s quantization
 - label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh
 - label: Documentation Build
-  working_dir: "/vllm-workspace/docs"
+  working_dir: "/vllm-workspace/test_docs/docs"
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -3,6 +3,7 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}
 steps:
  - label: ":docker: build image"
    commands:
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
@@ -15,11 +16,39 @@ steps:
          limit: 5
  - wait
  - group: "AMD Tests"
    depends_on: ~
    steps:
    {% for step in steps %}
    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
      - label: "AMD: {{ step.label }}"
        agents:
          queue: amd
        command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
        env:
          DOCKER_BUILDKIT: "1"
    {% endif %}
    {% endfor %}
  - label: "Neuron Test"
    depends_on: ~
    agents:
      queue: neuron
    command: bash .buildkite/run-neuron-test.sh
    soft_fail: true
  - label: "Intel Test"
    depends_on: ~
    command: bash .buildkite/run-cpu-test.sh
  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
@@ -27,6 +56,9 @@ steps:
    plugins:
      - kubernetes:
          podSpec:
            {% if step.num_gpus %}
            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
            {% endif %}
            volumes:
              - name: dshm
                emptyDir:
@@ -45,6 +77,8 @@ steps:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                {% endif %}
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
--- a/.github/ISSUE_TEMPLATE/100-documentation.yml
+++ b/.github/ISSUE_TEMPLATE/100-documentation.yml
@@ -0,0 +1,22 @@
 name: 📚 Documentation
 description: Report an issue related to https://docs.vllm.ai/
 title: "[Doc]: "
 labels: ["documentation"]
 body:
 - type: textarea
  attributes:
    label: 📚 The doc issue
    description: >
      A clear and concise description of what content in https://docs.vllm.ai/ is an issue.
  validations:
    required: true
 - type: textarea
  attributes:
    label: Suggest a potential alternative/fix
    description: >
      Tell us how we could improve the documentation in this regard.
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -0,0 +1,40 @@
 name: 🛠️ Installation
 description: Report an issue here when you hit errors during installation.
 title: "[Installation]: "
 labels: ["installation"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: Your current environment
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
      ```
  validations:
    required: true
 - type: textarea
  attributes:
    label: How you are installing vllm
    description: |
      Paste the full command you are trying to execute.
    value: |
      ```sh
      pip install -vvv vllm
      ```
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -0,0 +1,38 @@
 name: 💻 Usage
 description: Raise an issue here if you don't know how to use vllm.
 title: "[Usage]: "
 labels: ["usage"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: Your current environment
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
      ```
  validations:
    required: true
 - type: textarea
  attributes:
    label: How would you like to use vllm
    description: |
      A detailed description of how you want to use vllm.
    value: |
      I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm.
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/400-bug
+++ b/.github/ISSUE_TEMPLATE/400-bug
@@ -0,0 +1,84 @@
 name: 🐛 Bug report
 description: Raise an issue here if you find a bug.
 title: "[Bug]: "
 labels: ["bug"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: Your current environment
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
      ```
  validations:
    required: true
 - type: textarea
  attributes:
    label: 🐛 Describe the bug
    description: |
      Please provide a clear and concise description of what the bug is.
      If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
      ```python
      from vllm import LLM, SamplingParams
      prompts = [
          "Hello, my name is",
          "The president of the United States is",
          "The capital of France is",
          "The future of AI is",
      ]
      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
      llm = LLM(model="facebook/opt-125m")
      outputs = llm.generate(prompts, sampling_params)
      # Print the outputs.
      for output in outputs:
          prompt = output.prompt
          generated_text = output.outputs[0].text
          print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
      ```
      If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
      Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
      If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
    placeholder: |
      A clear and concise description of what the bug is.
      ```python
      # Sample code to reproduce the problem
      ```
      ```
      The error message you got, with the full traceback.
      ```
  validations:
    required: true
 - type: markdown
  attributes:
    value: >
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/500-feature
+++ b/.github/ISSUE_TEMPLATE/500-feature
@@ -0,0 +1,31 @@
 name: 🚀 Feature request
 description: Submit a proposal/request for a new vllm feature
 title: "[Feature]: "
 labels: ["feature request"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: 🚀 The feature, motivation and pitch
    description: >
      A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
  validations:
    required: true
 - type: textarea
  attributes:
    label: Alternatives
    description: >
      A description of any alternative solutions or features you've considered, if any.
 - type: textarea
  attributes:
    label: Additional context
    description: >
      Add any other context or screenshots about the feature request.
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/600-new
+++ b/.github/ISSUE_TEMPLATE/600-new
@@ -0,0 +1,33 @@
 name: 🤗 Support request for a new model from huggingface
 description: Submit a proposal/request for a new model from huggingface
 title: "[New Model]: "
 labels: ["new model"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
 - type: textarea
  attributes:
    label: The model to consider.
    description: >
      A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 .
  validations:
    required: true
 - type: textarea
  attributes:
    label: The closest model vllm already supports.
    description: >
      Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for?
 - type: textarea
  attributes:
    label: What's your difficulty of supporting the model you want?
    description: >
      For example, any new operators or new architecture?
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/700-performance
+++ b/.github/ISSUE_TEMPLATE/700-performance
@@ -0,0 +1,52 @@
 name: ⚡ Discussion on the performance of vllm
 description: Submit a proposal/discussion about the performance of vllm
 title: "[Performance]: "
 labels: ["performance"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: Proposal to improve performance
    description: >
      How do you plan to improve vllm's performance?
  validations:
    required: false
 - type: textarea
  attributes:
    label: Report of performance regression
    description: >
      Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks .
  validations:
    required: false
 - type: textarea
  attributes:
    label: Misc discussion on performance
    description: >
      Anything about the performance.
  validations:
    required: false
 - type: textarea
  attributes:
    label: Your current environment (if you think it is necessary)
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
      ```
  validations:
    required: false
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -0,0 +1,49 @@
 name: 💬 Request for comments (RFC).
 description: Ask for feedback on major architectural changes or design choices.
 title: "[RFC]: "
 labels: ["RFC"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
 - type: textarea
  attributes:
    label: Motivation.
    description: >
      The motivation of the RFC.
  validations:
    required: true
 - type: textarea
  attributes:
    label: Proposed Change.
    description: >
      The proposed change of the RFC.
  validations:
    required: true
 - type: textarea
  attributes:
    label: Feedback Period.
    description: >
      The feedback period of the RFC. Usually at least one week.
  validations:
    required: false
 - type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC.
  validations:
    required: false
 - type: textarea
  attributes:
    label: Any Other Things.
    description: >
      Any other things you would like to mention.
  validations:
    required: false
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/800-misc
+++ b/.github/ISSUE_TEMPLATE/800-misc
@@ -0,0 +1,21 @@
 name: 🎲 Misc/random discussions that do not fit into the above categories.
 description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
 title: "[Misc]: "
 labels: ["misc"]
 body:
 - type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 - type: textarea
  attributes:
    label: Anything you want to discuss about vllm.
    description: >
      Anything you want to discuss about vllm.
  validations:
    required: true
 - type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
 blank_issues_enabled: false
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,64 @@
 FILL IN THE PR DESCRIPTION HERE
 FIX #xxxx (*link existing issues this PR will resolve*)
 **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
 ---
 <details>
 <!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
 <summary><b> PR Checklist (Click to Expand) </b></summary>
 <p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
 <h3>PR Title and Classification</h3>
 <p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
 <ul>
    <li><code>[Bugfix]</code> for bug fixes.</li>
    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
    <li><code>[Frontend]</code> for changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.) </li>
    <li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
    <li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
    <li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
 </ul>
 <p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
 <h3>Code Quality</h3>
 <p>The PR needs to meet the following code quality standards:</p>
 <ul>
    <li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
    <li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
    <li>The code needs to be well-documented to ensure future contributors can easily understand the code.</li>
    <li>Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.</li>
    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
 </ul>
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
 <h3>What to Expect for the Reviews</h3>
 <p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
 <ul>
    <li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
    <li> After the PR is assigned, the reviewer will provide a status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
    <li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
    <li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
 </li>
 </ul>
 <h3>Thank You</h3>
 <p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
 </details>
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -0,0 +1,50 @@
 name: mypy
 on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
 jobs:
  # NOTE(review): the job id is `ruff` although this workflow runs mypy.
  # It is kept as-is because the id is part of the reported check name;
  # rename it only together with any branch-protection rules that use it.
  ruff:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install mypy==1.9.0
        pip install types-setuptools
        pip install types-PyYAML
        pip install types-requests
    # Each package is checked in a separate invocation against the shared
    # configuration in pyproject.toml; every directory is listed exactly once.
    - name: Mypy
      run: |
        mypy vllm/attention --config-file pyproject.toml
        mypy vllm/core --config-file pyproject.toml
        mypy vllm/distributed --config-file pyproject.toml
        mypy vllm/entrypoints --config-file pyproject.toml
        mypy vllm/executor --config-file pyproject.toml
        mypy vllm/usage --config-file pyproject.toml
        mypy vllm/*.py --config-file pyproject.toml
        mypy vllm/transformers_utils --config-file pyproject.toml
        mypy vllm/engine  --config-file pyproject.toml
        mypy vllm/worker --config-file pyproject.toml
        mypy vllm/spec_decode --config-file pyproject.toml
        mypy vllm/model_executor  --config-file pyproject.toml
        mypy vllm/lora --config-file pyproject.toml
        mypy vllm/logging --config-file pyproject.toml
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -49,13 +49,16 @@ jobs:
      matrix:
          os: ['ubuntu-20.04']
          python-version: ['3.8', '3.9', '3.10', '3.11']
-          pytorch-version: ['2.1.2']  # Must be the most recent version that meets requirements.txt.
+          pytorch-version: ['2.3.0']  # Must be the most recent version that meets requirements-cuda.txt.
          cuda-version: ['11.8', '12.1']
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Setup ccache
        uses: hendrikmuhs/ccache-action@v1.2
      - name: Set up Linux Env
        if: ${{ runner.os == 'Linux' }}
        run: |
@@ -76,6 +79,8 @@ jobs:
      - name: Build wheel
        shell: bash
        env:
          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
        run: |
          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
@@ -25,10 +25,13 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
+        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
    - name: Analysing the code with ruff
      run: |
-        ruff vllm tests
+        ruff .
    - name: Spelling check with codespell
      run: |
-         codespell --toml pyproject.toml
+        codespell --toml pyproject.toml
    - name: Run isort
      run: |
        isort . --check-only
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -9,12 +9,13 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 # Install requirements
 $python_executable -m pip install wheel packaging
-$python_executable -m pip install -r requirements.txt
+$python_executable -m pip install -r requirements-cuda.txt
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
 # Make sure punica is built for the release (for LoRA)
 export VLLM_INSTALL_PUNICA_KERNELS=1
-
+# Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
--- a/.github/workflows/scripts/create_release.js
+++ b/.github/workflows/scripts/create_release.js
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
 			generate_release_notes: true,
 			name: process.env.RELEASE_TAG,
 			owner: context.repo.owner,
-			prerelease: false,
+			prerelease: true,
 			repo: context.repo.repo,
 			tag_name: process.env.RELEASE_TAG,
 		});
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,8 @@ instance/
 # Sphinx documentation
 docs/_build/
 docs/source/getting_started/examples/*.rst
 !**/*.template.rst
 # PyBuilder
 .pybuilder/
@@ -181,6 +183,7 @@ _build/
 # hip files generated by PyTorch
 *.hip
 *_hip*
 hip_compat.h
 # Benchmark dataset
 *.json
--- a/.yapfignore
+++ b/.yapfignore
@@ -0,0 +1 @@
 collect_env.py
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,294 @@
 cmake_minimum_required(VERSION 3.21)
 project(vllm_extensions LANGUAGES CXX)
 # Target device backend ("cuda", "rocm" or "cpu").  This is a string-valued
 # cache entry rather than an `option()`: `option()` only declares boolean
 # ON/OFF switches, so it cannot provide a usable "cuda" default.  A value
 # passed with -DVLLM_TARGET_DEVICE=... still takes precedence.
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
 set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
 # Supported NVIDIA architectures.
 set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
 #
 # Supported/expected torch versions for CUDA/ROCm.
 #
 # Currently, having an incorrect pytorch version results in a warning
 # rather than an error.
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
 set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
 set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
 set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
 #
 # Try to find python package with an executable that exactly matches
 # `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
 #
 if (VLLM_PYTHON_EXECUTABLE)
  find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
 else()
  message(FATAL_ERROR
    "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
    " before running cmake configure.")
 endif()
 #
 # Update cmake's `CMAKE_PREFIX_PATH` with torch location.
 #
 append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
 # Ensure the 'nvcc' command is in the PATH
 # NOTE(review): nothing visible before this point sets CUDA_FOUND (Torch is
 # only imported below), so this check may never trigger -- confirm whether
 # it should run after `find_package(Torch)`.
 find_program(NVCC_EXECUTABLE nvcc)
 if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
    message(FATAL_ERROR "nvcc not found")
 endif()
 #
 # Import torch cmake configuration.
 # Torch also imports CUDA (and partially HIP) languages with some customizations,
 # so there is no need to do this explicitly with check_language/enable_language,
 # etc.
 #
 find_package(Torch REQUIRED)
 #
 # Normally `torch.utils.cpp_extension.CUDAExtension` would add
 # `libtorch_python.so` for linking against an extension. Torch's cmake
 # configuration does not include this library (presumably since the cmake
 # config is used for standalone C++ binaries that link against torch).
 # The `libtorch_python.so` library defines some of the glue code between
 # torch/python via pybind and is required by VLLM extensions for this
 # reason. So, add it manually with `find_library` using torch's
 # installed library path.
 #
 find_library(torch_python_LIBRARY torch_python PATHS
  "${TORCH_INSTALL_PREFIX}/lib")
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
 if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
    NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
        include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
    else()
        message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
    endif()
    # Everything below this point is GPU (CUDA/HIP) specific.
    return()
 endif()
 #
 # Set up GPU language and check the torch version and warn if it isn't
 # what is expected.
 #
 if (NOT HIP_FOUND AND CUDA_FOUND)
  set(VLLM_GPU_LANG "CUDA")
  if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
      "expected for CUDA build, saw ${Torch_VERSION} instead.")
  endif()
 elseif(HIP_FOUND)
  set(VLLM_GPU_LANG "HIP")
  # Importing torch recognizes and sets up some HIP/ROCm configuration but does
  # not let cmake recognize .hip files. In order to get cmake to understand the
  # .hip extension automatically, HIP must be enabled explicitly.
  enable_language(HIP)
  # ROCm 5.x
  if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
      "expected for ROCm 5.x build, saw ${Torch_VERSION} instead.")
  endif()
  # ROCm 6.x
  if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
      "expected for ROCm 6.x build, saw ${Torch_VERSION} instead.")
  endif()
 else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
 endif()
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # the supported versions for the current language.
 # The final set of arches is stored in `VLLM_GPU_ARCHES`.
 #
 override_gpu_arches(VLLM_GPU_ARCHES
  ${VLLM_GPU_LANG}
  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
 #
 # Query torch for additional GPU compilation flags for the given
 # `VLLM_GPU_LANG`.
 # The final set of flags is stored in `VLLM_GPU_FLAGS`.
 #
 get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
 #
 # Set nvcc parallelism.
 #
 if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 #
 # Define extension targets
 #
 #
 # _C extension
 #
 set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/attention_kernels.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/fp8/fp8_cuda_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/moe_align_block_size_kernels.cu"
  "csrc/pybind.cpp")
 # These sources are only added to the _C extension for CUDA builds.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/quantization/marlin/marlin_cuda_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
    "csrc/custom_all_reduce.cu")
 endif()
 define_gpu_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  WITH_SOABI)
 #
 # _moe_C extension
 #
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/moe_ops.cpp"
  "csrc/moe/topk_softmax_kernels.cu")
 define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  WITH_SOABI)
 #
 # _punica_C extension
 #
 set(VLLM_PUNICA_EXT_SRC
  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
  "csrc/punica/punica_ops.cc")
 #
 # Copy GPU compilation flags+update for punica
 #
 set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
 list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
  "-D__CUDA_NO_HALF_OPERATORS__"
  "-D__CUDA_NO_HALF_CONVERSIONS__"
  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
  "-D__CUDA_NO_HALF2_OPERATORS__")
 #
 # Filter out CUDA architectures < 8.0 for punica.
 #
 # The variable name is used directly (not `${VLLM_GPU_LANG}`) so the
 # comparison matches the other language checks in this file and the
 # expanded value is never re-interpreted as another variable name.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
  set(VLLM_PUNICA_GPU_ARCHES)
  foreach(ARCH ${VLLM_GPU_ARCHES})
    string_to_ver(CODE_VER ${ARCH})
    if (CODE_VER GREATER_EQUAL 8.0)
      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
    endif()
  endforeach()
  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
 endif()
 # VLLM_PUNICA_GPU_ARCHES is only populated in the CUDA branch above, so
 # non-CUDA builds fall through to the warning below.
 if (VLLM_PUNICA_GPU_ARCHES)
  define_gpu_extension_target(
    _punica_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_PUNICA_EXT_SRC}
    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
    WITH_SOABI)
 else()
  message(WARNING "Unable to create _punica_C target because none of the "
    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
 endif()
 #
 # Add the `default` target which detects which extensions should be
 # built based on platform/architecture.  This is the same logic that
 # setup.py uses to select which extensions should be built and should
 # be kept in sync.
 #
 # The `default` target makes direct use of cmake easier since knowledge
 # of which extensions are supported has been factored in, e.g.
 #
 # mkdir build && cd build
 # cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
 # cmake --build . --target default
 #
 add_custom_target(default)
 if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling C extension.")
  add_dependencies(default _C)
 endif()
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  message(STATUS "Enabling moe extension.")
  add_dependencies(default _moe_C)
  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
  # there are supported target arches.
  #
  # The environment variable must be read with `$ENV{...}`: a bare
  # `ENV{...}` inside `if()` is never treated as an environment lookup and
  # always evaluates to false.  The expansion is quoted so that an unset
  # variable becomes an empty (false) string instead of a missing argument.
  if (VLLM_PUNICA_GPU_ARCHES AND
      ("$ENV{VLLM_INSTALL_PUNICA_KERNELS}" OR VLLM_INSTALL_PUNICA_KERNELS))
    message(STATUS "Enabling punica extension.")
    add_dependencies(default _punica_C)
  endif()
 endif()
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -21,7 +21,6 @@ Express your support on Twitter if vLLM aids you, or simply offer your appreciat
 ### Build from source
 ```bash
 pip install -r requirements.txt
 pip install -e .  # This may take several minutes.
 ```
@@ -30,6 +29,8 @@ pip install -e .  # This may take several minutes.
 ```bash
 pip install -r requirements-dev.txt
 # linting and formatting
 bash format.sh
 # Static type checking
 mypy
 # Unit tests
@@ -45,31 +46,9 @@ pytest tests/
 If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
 If not, please file a new issue, providing as much relevant information as possible.
-### Coding Style Guide
+### Pull Requests & Code Reviews
-In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
+Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for a detailed contribution guide.
 We include a formatting script [`format.sh`](./format.sh) to format the code.
 ### Pull Requests
 When submitting a pull request:
 1. Make sure your code has been rebased on top of the latest commit on the main branch.
 2. Ensure code is properly formatted by running [`format.sh`](./format.sh).
 3. Include a detailed description of the changes in the pull request.
 Explain why you made the changes you did.
 If your pull request fixes an open issue, please include a reference to it in the description.
 ### Code Reviews
 All submissions, including submissions by project members, require a code review.
 To make the review process as smooth as possible, please:
 1. Keep your changes as concise as possible.
 If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests.
 2. Respond to all comments within a reasonable time frame.
 If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
 ### Thank You
--- a/138
+++ b/138
@@ -1,8 +1,13 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 # Please update any changes made here to
 # docs/source/dev/dockerfile/dockerfile.rst and
 # docs/source/assets/dev/dockerfile-stages-dependency.png
 #################### BASE BUILD IMAGE ####################
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+# prepare basic build environment
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
 RUN apt-get update -y \
    && apt-get install -y python3-pip git
@@ -11,23 +16,31 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
 WORKDIR /workspace
 # install build and runtime dependencies
-COPY requirements.txt requirements.txt
+COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    pip install -r requirements-cuda.txt
 # install development dependencies
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################
-#################### EXTENSION BUILD IMAGE ####################
+#################### WHEEL BUILD IMAGE ####################
 FROM dev AS build
 # install build dependencies
@@ -35,16 +48,19 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt
-# copy input files
+# install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache
 # files and directories related to build wheels
 COPY csrc csrc
 COPY setup.py setup.py
-COPY requirements.txt requirements.txt
+COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 COPY pyproject.toml pyproject.toml
-COPY vllm/__init__.py vllm/__init__.py
+COPY vllm vllm
 # cuda arch list used by torch
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
 ENV MAX_JOBS=${max_jobs}
@@ -54,52 +70,94 @@ ENV NVCC_THREADS=$nvcc_threads
 # make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
-RUN python3 setup.py build_ext --inplace
+ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=dist
 # check the size of the wheel, we cannot upload wheels larger than 100MB
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 RUN python3 check-wheel-size.py dist
 # the `vllm_nccl` package must be installed from source distribution
 # pip is too smart to store a wheel in the cache, and other CI jobs
 # will directly use the wheel from the cache, which is not what we want.
 # we need to remove it manually
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip cache remove vllm_nccl*
 #################### EXTENSION Build IMAGE ####################
 #################### FLASH_ATTENTION Build IMAGE ####################
 FROM dev as flash-attn-builder
 # max jobs used for build
 ARG max_jobs=2
 ENV MAX_JOBS=${max_jobs}
 # flash attention version
 ARG flash_attn_version=v2.5.8
 ENV FLASH_ATTN_VERSION=${flash_attn_version}
 WORKDIR /usr/src/flash-attention-v2
 # Download the wheel or build it if a pre-compiled release doesn't exist
 RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
    --no-build-isolation --no-deps --no-cache-dir
 #################### FLASH_ATTENTION Build IMAGE ####################
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
 WORKDIR /vllm-workspace
 RUN apt-get update -y \
    && apt-get install -y python3-pip git vim
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-12.4/compat/
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
    pip install dist/*.whl --verbose
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    --mount=type=cache,target=/root/.cache/pip \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 #################### vLLM installation IMAGE ####################
 #################### TEST IMAGE ####################
 # image to run unit testing suite
-FROM dev AS test
+# note that this uses vllm installed by `pip`
 FROM vllm-base AS test
 # copy pytorch extensions separately to avoid having to rebuild
 # when python code changes
 WORKDIR /vllm-workspace
 # ADD is used to preserve directory structure
 ADD . /vllm-workspace/
 COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
 # ignore build dependencies installation because we are using pre-compiled extensions
 RUN rm pyproject.toml
 RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
 #################### TEST IMAGE ####################
-
+# install development dependencies (for testing)
 #################### RUNTIME BASE IMAGE ####################
 # We used base cuda image because pytorch installs its own cuda libraries.
 # However cupy depends on cuda libraries so we had to switch to the runtime image
 # In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
 FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
 # libnccl required for ray
 RUN apt-get update -y \
    && apt-get install -y python3-pip
 WORKDIR /workspace
 COPY requirements.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    pip install -r requirements-dev.txt
 #################### RUNTIME BASE IMAGE ####################
 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
 # will not be imported by other tests
 RUN mkdir test_docs
 RUN mv docs test_docs/
 RUN mv vllm test_docs/
 #################### TEST IMAGE ####################
 #################### OPENAI API SERVER ####################
 # openai api server alternative
 FROM vllm-base AS vllm-openai
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate
+    pip install accelerate hf_transfer modelscope
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+ENV VLLM_USAGE_SOURCE production-docker-image
 COPY vllm vllm
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -0,0 +1,20 @@
 # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
 FROM ubuntu:22.04
 # Install the toolchain and make gcc-12/g++-12 the default compilers.
 RUN apt-get update  -y \
    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 # The version specifier is quoted: an unquoted `>` in a shell-form RUN is an
 # output redirection, which would drop the constraint and write pip's output
 # to a file named `=49.4.0`.
 RUN pip install --upgrade pip \
    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
 # CPU-only PyTorch wheels are served from the extra index.
 RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 # Build and install vLLM for the CPU backend.
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 CMD ["/bin/bash"]
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -0,0 +1,36 @@
 # default base image
 ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
 FROM $BASE_IMAGE
 # An ARG declared before FROM is not visible inside the build stage;
 # re-declare it (without a value) so the echo below prints the real image.
 ARG BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"
 # Install some basic utilities
 RUN apt-get update && apt-get install python3 python3-pip -y
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
 ARG APP_MOUNT=/app
 VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}
 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 # Copy only the files needed to install vLLM for Neuron.
 COPY ./vllm /app/vllm/vllm
 COPY ./setup.py /app/vllm/setup.py
 COPY ./requirements-common.txt /app/vllm/requirements-common.txt
 COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
 RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt
 # Set before the install step below so it is visible to setup.py.
 ENV VLLM_BUILD_WITH_NEURON=1
 # Each RUN starts from WORKDIR again, so no trailing `cd ..` is needed.
 RUN cd /app/vllm \
    && pip install -e .
 CMD ["/bin/bash"]
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -14,7 +14,7 @@ RUN echo "Base image is $BASE_IMAGE"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
 RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
-ARG FA_BRANCH="3d2b6f5"
+ARG FA_BRANCH="ae7928c"
 RUN echo "FA_BRANCH is $FA_BRANCH"
 # whether to build flash-attention
@@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH"
 # In that case, we need to use the python reference attention implementation in vllm
 ARG BUILD_FA="1"
 # whether to build triton on rocm
 ARG BUILD_TRITON="1"
 # Install some basic utilities
 RUN apt-get update && apt-get install python3 python3-pip -y
@@ -43,7 +46,7 @@ RUN apt-get update && apt-get install -y \
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
+ARG APP_MOUNT=/vllm-workspace
 VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}
@@ -70,26 +73,35 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
    && cd ..; \
    fi
 COPY ./ /app/vllm
 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install xformers==0.0.23 --no-deps
 # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
 # Manually removed it so that later steps of numpy upgrade can continue
 RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
-RUN cd /app \
+# build triton
-    && cd vllm \
+RUN if [ "$BUILD_TRITON" = "1" ]; then \
-    && pip install -U -r requirements-rocm.txt \
+    mkdir -p libs \
-    && if [ "$BUILD_FA" = "1" ]; then \
+    && cd libs \
-       bash patch_xformers.rocm.sh; fi \
+    && pip uninstall -y triton \
-    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
+    && git clone https://github.com/ROCm/triton.git \
    && cd triton/python \
    && pip3 install . \
    && cd ../..; \
    fi
 WORKDIR /vllm-workspace
 COPY . .
 RUN python3 -m pip install --upgrade pip numba
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -U -r requirements-rocm.txt \
    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
    && python3 setup.py install \
    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
    && cd ..
 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]
+RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
 CMD ["/bin/bash"]
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,10 @@
 include LICENSE
-include requirements.txt
+include requirements-common.txt
 include requirements-cuda.txt
 include requirements-rocm.txt
 include requirements-neuron.txt
 include requirements-cpu.txt
 include CMakeLists.txt
 recursive-include cmake *
 recursive-include csrc *
--- a/README.md
+++ b/README.md
@@ -14,9 +14,8 @@ Easy, fast, and cheap LLM serving for everyone
 </p>
 ---
 *Latest News* 🔥
 - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
 - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
 - [2024/01] Added ROCm 6.0 support to vLLM.
 - [2023/12] Added ROCm 5.7 support to vLLM.
@@ -58,6 +57,8 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
 - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
 - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
 - Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
 - DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
 - DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
 - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
 - Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
@@ -67,18 +68,23 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
 - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
 - InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
 - LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
 - MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
 - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
+- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
 - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
+- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
 - Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
 - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
 - Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
 - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
+- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
 - Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
 - StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
 - Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
 - Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
 - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
 Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -1,8 +1,10 @@
 import json
 import os
 import sys
 import time
-from dataclasses import dataclass
+import traceback
-from typing import Optional
+from dataclasses import dataclass, field
 from typing import List, Optional
 import aiohttp
 from tqdm.asyncio import tqdm
@@ -25,9 +27,12 @@ class RequestFuncInput:
 class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
-    latency: float = 0
+    latency: float = 0.0
-    ttft: float = 0
+    ttft: float = 0.0  # Time to first token
    itl: List[float] = field(
        default_factory=list)  # List of inter-token latencies
    prompt_len: int = 0
    error: str = ""
 async def async_request_tgi(
@@ -53,73 +58,41 @@ async def async_request_tgi(
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
-        ttft = 0
+        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
-                    async for data in response.content.iter_any():
+                    async for chunk_bytes in response.content:
-                        if ttft == 0:
+                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data:")
                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st
-                    body = data.decode("utf-8").lstrip("data:")
+                        # Decoding phase
-                    output.generated_text = json.loads(body)["generated_text"]
+                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)
                        most_recent_timestamp = timestamp
                    output.latency = most_recent_timestamp - st
                    output.success = True
-                else:
+                    output.generated_text = data["generated_text"]
-                    output.success = False
+        except Exception:
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False
        if pbar:
            pbar.update(1)
        return output
 async def async_request_vllm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate")
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "prompt": request_func_input.prompt,
            "n": 1,
            "best_of": request_func_input.best_of,
            "use_beam_search": request_func_input.use_beam_search,
            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st
                    # When streaming, '\0' is appended to the end of the response.
                    body = data.decode("utf-8").strip("\0")
                    output.generated_text = json.loads(
                        body)["text"][0][len(request_func_input.prompt):]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))
        if pbar:
            pbar.update(1)
@@ -146,26 +119,46 @@ async def async_request_trt_llm(
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        ttft = 0
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
-            async with session.post(url=api_url, json=payload) as resp:
+            async with session.post(url=api_url, json=payload) as response:
-                if resp.status == 200:
+                if response.status == 200:
-                    async for data in resp.content.iter_any():
+                    async for chunk_bytes in response.content:
-                        if ttft == 0:
+                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data:")
                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st
-                    body = data.decode("utf-8").lstrip("data:")
+                        # Decoding phase
-                    output.generated_text = json.loads(body)["text_output"]
+                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)
                        most_recent_timestamp = timestamp
                    output.latency = most_recent_timestamp - st
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
-        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))
        if pbar:
            pbar.update(1)
@@ -181,34 +174,35 @@ async def async_request_deepspeed_mii(
        assert not request_func_input.use_beam_search
        payload = {
-            "prompts": request_func_input.prompt,
+            "prompt": request_func_input.prompt,
-            "max_new_tokens": request_func_input.output_len,
+            "max_tokens": request_func_input.output_len,
-            "ignore_eos": True,
+            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "do_sample": True,
            "temperature":
            0.01,  # deepspeed-mii does not accept 0.0 temperature.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
-        # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder.
+        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
-        # https://github.com/microsoft/DeepSpeed-MII/pull/311
+        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
-                                    json=payload) as resp:
+                                    json=payload) as response:
-                if resp.status == 200:
+                if response.status == 200:
-                    parsed_resp = await resp.json()
+                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
-                    output.generated_text = parsed_resp[0]["generated_text"]
+                    output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
-        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))
        if pbar:
            pbar.update(1)
@@ -220,7 +214,9 @@ async def async_request_openai_completions(
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
-    assert api_url.endswith("v1/completions")
+    assert api_url.endswith(
        "v1/completions"
    ), "OpenAI Completions API URL must end with 'v1/completions'."
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@@ -240,45 +236,154 @@ async def async_request_openai_completions(
        output.prompt_len = request_func_input.prompt_len
        generated_text = ""
-        ttft = 0
+        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
-                    async for chunk in response.content:
+                    async for chunk_bytes in response.content:
-                        if ttft == 0:
+                        chunk_bytes = chunk_bytes.strip()
-                            ttft = time.perf_counter() - st
+                        if not chunk_bytes:
                            output.ttft = ttft
                        chunk = chunk.strip()
                        if not chunk:
                            continue
-                        chunk = chunk.decode("utf-8").lstrip("data: ")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
-                            body = json.loads(chunk)
+                            data = json.loads(chunk)
-                            generated_text += body["choices"][0]["text"]
+
                            if data["choices"][0]["text"]:
                                timestamp = time.perf_counter()
                                # First token
                                if ttft == 0.0:
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft
                                # Decoding phase
                                # NOTE: Some completion API might have a last
                                # usage summary response without a token so we
                                # do not want to include as inter-token-latency
                                elif data.get("usage", None) is None:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)
                                most_recent_timestamp = timestamp
                                generated_text += data["choices"][0]["text"]
                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
-                else:
+        except Exception:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))
    if pbar:
        pbar.update(1)
    return output
 async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    """Send one streaming chat-completions request and record its timings.

    The prompt is posted as a single user message with ``stream`` enabled.
    The response is read line by line; every non-empty line has its
    ``"data: "`` prefix removed and is parsed as JSON, except the literal
    ``[DONE]`` line, which marks the end of the stream.

    Args:
        request_func_input: Request description. This function reads
            ``api_url``, ``model``, ``prompt``, ``output_len``,
            ``prompt_len`` and ``use_beam_search`` from it.
        pbar: Optional progress bar, advanced by one when the request ends
            (successfully or not).

    Returns:
        A ``RequestFuncOutput``. On success it carries the concatenated
        generated text, the time to first token (``ttft``), the gaps
        between later content chunks (``itl``) and the total ``latency``.
        On a non-200 status ``error`` holds ``response.reason``; on any
        exception ``error`` holds the formatted traceback. In both failure
        cases ``success`` is ``False``.

    Raises:
        AssertionError: If the URL does not end with ``v1/chat/completions``
            or if beam search is requested.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(
        "v1/chat/completions"
    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        # Beam search is rejected up front; the payload has no field for it.
        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "messages": [
                {
                    "role": "user",
                    "content": request_func_input.prompt,
                },
            ],
            "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        # The bearer token comes from the OPENAI_API_KEY environment variable;
        # if it is unset the header value is the literal text "Bearer None".
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        generated_text = ""
        # 0.0 doubles as the "no token seen yet" marker for ttft.
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        # Skip the blank separator lines between events.
                        if not chunk_bytes:
                            continue
                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data: ")
                        if chunk == "[DONE]":
                            # NOTE(review): `latency` is bound only here. If
                            # the stream ends without a "[DONE]" line, the
                            # assignment to output.latency below raises and
                            # the request is reported as failed.
                            latency = time.perf_counter() - st
                        else:
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)
                            delta = data["choices"][0]["delta"]
                            # Only chunks with non-empty content count as
                            # tokens for ttft / itl purposes.
                            if delta.get("content", None):
                                # First token
                                if ttft == 0.0:
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft
                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)
                                generated_text += delta["content"]
                            # Updated for every parsed chunk, including ones
                            # that carried no content.
                            most_recent_timestamp = timestamp
                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            # Any failure (network, JSON parsing, missing keys, ...) marks the
            # request unsuccessful and keeps the traceback for later reporting.
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))
    if pbar:
        pbar.update(1)
    return output
 # Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
 # introduced in Python 3.9
 def remove_prefix(text: str, prefix: str) -> str:
    """Return `text` with one leading `prefix` removed.

    If `text` does not start with `prefix`, it is returned unchanged. Only
    a single occurrence is stripped, even when the prefix repeats.
    """
    return text[len(prefix):] if text.startswith(prefix) else text
 ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
-    "vllm": async_request_vllm,
+    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
 }
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -9,6 +9,7 @@ import torch
 from tqdm import tqdm
 from vllm import LLM, SamplingParams
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 def main(args: argparse.Namespace):
@@ -16,17 +17,20 @@ def main(args: argparse.Namespace):
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
-    llm = LLM(
+    llm = LLM(model=args.model,
-        model=args.model,
+              tokenizer=args.tokenizer,
-        tokenizer=args.tokenizer,
+              quantization=args.quantization,
-        quantization=args.quantization,
+              tensor_parallel_size=args.tensor_parallel_size,
-        tensor_parallel_size=args.tensor_parallel_size,
+              trust_remote_code=args.trust_remote_code,
-        trust_remote_code=args.trust_remote_code,
+              dtype=args.dtype,
-        dtype=args.dtype,
+              enforce_eager=args.enforce_eager,
-        enforce_eager=args.enforce_eager,
+              kv_cache_dtype=args.kv_cache_dtype,
-        kv_cache_dtype=args.kv_cache_dtype,
+              quantization_param_path=args.quantization_param_path,
-        device=args.device,
+              device=args.device,
-    )
+              ray_workers_use_nsight=args.ray_workers_use_nsight,
              enable_chunked_prefill=args.enable_chunked_prefill,
              download_dir=args.download_dir,
              block_size=args.block_size)
    sampling_params = SamplingParams(
        n=args.n,
@@ -65,7 +69,8 @@ def main(args: argparse.Namespace):
            return latency
    print("Warming up...")
-    run_to_completion(profile_dir=None)
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        run_to_completion(profile_dir=None)
    if args.profile:
        profile_dir = args.profile_result_dir
@@ -81,7 +86,12 @@ def main(args: argparse.Namespace):
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90]
    percentiles = np.percentile(latencies, percentages)
    print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):
        print(f'{percentage}% percentile latency: {percentile} seconds')
 if __name__ == '__main__':
@@ -92,7 +102,7 @@ if __name__ == '__main__':
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                        default=None)
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--input-len', type=int, default=32)
@@ -103,9 +113,13 @@ if __name__ == '__main__':
                        default=1,
                        help='Number of generated sequences per prompt.')
    parser.add_argument('--use-beam-search', action='store_true')
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=10,
                        help='Number of iterations to run for warmup.')
    parser.add_argument('--num-iters',
                        type=int,
-                        default=3,
+                        default=30,
                        help='Number of iterations to run.')
    parser.add_argument('--trust-remote-code',
                        action='store_true',
@@ -125,10 +139,23 @@ if __name__ == '__main__':
    parser.add_argument(
        "--kv-cache-dtype",
        type=str,
-        choices=['auto', 'fp8_e5m2'],
+        choices=['auto', 'fp8'],
        default='auto',
        help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        'Data type for kv cache storage. If "auto", will use model data type. '
        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
        'common inference criteria.')
    parser.add_argument(
        '--quantization-param-path',
        type=str,
        default=None,
        help='Path to the JSON file containing the KV cache scaling factors. '
        'This should generally be supplied, when KV cache dtype is FP8. '
        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')
    parser.add_argument(
        '--profile',
        action='store_true',
@@ -143,7 +170,26 @@ if __name__ == '__main__':
        "--device",
        type=str,
        default="cuda",
-        choices=["cuda"],
+        choices=["cuda", "cpu"],
-        help='device type for vLLM execution, supporting CUDA only currently.')
+        help='device type for vLLM execution, supporting CUDA and CPU.')
    parser.add_argument('--block-size',
                        type=int,
                        default=16,
                        help='block size of key/value cache')
    parser.add_argument(
        '--enable-chunked-prefill',
        action='store_true',
        help='If True, the prefill requests can be chunked based on the '
        'max_num_batched_tokens')
    parser.add_argument(
        "--ray-workers-use-nsight",
        action='store_true',
        help="If specified, use nsight to profile ray workers",
    )
    parser.add_argument('--download-dir',
                        type=str,
                        default=None,
                        help='directory to download and load the weights, '
                        'default to the default cache dir of huggingface')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -0,0 +1,62 @@
 import argparse
 import time
 from vllm import LLM, SamplingParams
 # Fixed benchmark prompt: an instruction, a markdown table, and a question
 # about that table. It is a runtime string, so its wording is left untouched.
 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
 def test_prefix(llm=None, sampling_params=None, prompts=None):
    """Run one `llm.generate` call over `prompts` and print its wall time.

    The generation result is discarded; only the elapsed time, measured
    with `time.time()`, is reported on stdout.
    """
    began = time.time()
    llm.generate(prompts, sampling_params=sampling_params)
    elapsed = time.time() - began
    print(f"cost time {elapsed}")
 def main(args):
    """Build the engine from `args` and time two identical generation passes.

    Reads `model`, `use_v2_block_manager`, `tensor_parallel_size`,
    `enable_prefix_caching` and `output_len` from `args`. The same batch of
    100 copies of `PROMPT` is generated twice; `test_prefix` prints the
    elapsed time of each pass.
    """
    engine = LLM(
        model=args.model,
        tokenizer_mode='auto',
        trust_remote_code=True,
        enforce_eager=True,
        use_v2_block_manager=args.use_v2_block_manager,
        tensor_parallel_size=args.tensor_parallel_size,
        enable_prefix_caching=args.enable_prefix_caching,
    )

    # Every request carries the identical prompt text.
    repeat_count = 100
    batch = [PROMPT for _ in range(repeat_count)]

    params = SamplingParams(temperature=0, max_tokens=args.output_len)

    # Same call twice: the first pass is labelled warm-up, the second is the
    # run announced as the actual generation.
    for banner in ("------warm up------", "------start generating------"):
        print(banner)
        test_prefix(
            llm=engine,
            prompts=batch,
            sampling_params=params,
        )
 # Script entry point: parse the command-line options and run the benchmark.
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Benchmark the performance with or without automatic '
        'prefix caching.')
    # Model name or path handed to LLM(model=...).
    parser.add_argument('--model',
                        type=str,
                        default='baichuan-inc/Baichuan2-13B-Chat')
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    # Used as max_tokens for every request.
    parser.add_argument('--output-len', type=int, default=10)
    # Both switches default to off; run the script with and without
    # --enable-prefix-caching to compare the printed timings.
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceMangerV2')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,8 +1,8 @@
 """Benchmark online serving throughput.
 On the server side, run one of the following commands:
-    (vLLM backend)
+    vLLM OpenAI API server
-    python -m vllm.entrypoints.api_server \
+    python -m vllm.entrypoints.openai.api_server \
        --model <your_model> --swap-space 16 \
        --disable-log-requests
@@ -12,28 +12,30 @@ On the server side, run one of the following commands:
 On the client side, run:
    python benchmarks/benchmark_serving.py \
        --backend <backend> \
-        --tokenizer <your_model> --dataset <target_dataset> \
+        --model <your_model> \
-        --request-rate <request_rate>
+        --dataset-name sharegpt \
        --dataset-path <path to dataset> \
        --request-rate <request_rate> \ # By default <request_rate> is inf
        --num-prompts <num_prompts> # By default <num_prompts> is 1000
 """
 import argparse
 import asyncio
 import json
 import os
 import random
 import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Tuple
+from typing import AsyncGenerator, List, Optional, Tuple
 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from backend_request_func import (
+from vllm.transformers_utils.tokenizer import get_tokenizer
    ASYNC_REQUEST_FUNCS,
    RequestFuncInput,
    RequestFuncOutput,
 )
@dataclass
@@ -52,11 +54,15 @@ class BenchmarkMetrics:
    p99_tpot_ms: float
-def sample_requests(
+def sample_sharegpt_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
@@ -66,37 +72,101 @@ def sample_requests(
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]
-    # some of these will be filtered out, so sample more than we need
+    # Shuffle the dataset.
-    sampled_indices = random.sample(range(len(dataset)),
+    random.shuffle(dataset)
                                    int(num_requests * 1.2))
    dataset = [dataset[i] for i in sampled_indices]
-    # Tokenize the prompts and completions.
+    # Filter out sequences that are too long or too short
    prompts = [prompt for prompt, _ in dataset]
    prompt_token_ids = tokenizer(prompts).input_ids
    completions = [completion for _, completion in dataset]
    completion_token_ids = tokenizer(completions).input_ids
    tokenized_dataset = []
    for i in range(len(dataset)):
        output_len = len(completion_token_ids[i])
        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
    # Filter out too long sequences.
    filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    for i in range(len(dataset)):
        if len(filtered_dataset) == num_requests:
            break
        # Tokenize the prompts and completions.
        prompt = dataset[i][0]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            # This is because TGI causes errors when the input or output length
            # is too short.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))
-    # Sample the requests.
+    return filtered_dataset
-    sampled_requests = random.sample(filtered_dataset, num_requests)
+
 def sample_sonnet_requests(
    dataset_path: str,
    num_requests: int,
    input_len: int,
    output_len: int,
    prefix_len: int,
    tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, str, int, int]]:
    """Build chat prompts of roughly `input_len` tokens from a file of lines.

    Each prompt is a fixed instruction followed by lines from the file at
    `dataset_path`. The first lines of the file (about `prefix_len` tokens
    including the instruction) are shared by every request; the remaining
    lines are drawn per request with `random.sample`. Line counts are
    derived from the average tokenized line length, so the resulting prompt
    lengths are approximate.

    Args:
        dataset_path: Text file read with `readlines()`, one entry per line.
        num_requests: Number of requests to generate.
        input_len: Target prompt length in tokens.
        output_len: Output length copied unchanged into every request.
        prefix_len: Target length in tokens of the shared prefix.
        tokenizer: Tokenizer used for length measurement and for
            `apply_chat_template`.

    Returns:
        One tuple per request:
        `(prompt, prompt_formatted, prompt_len, output_len)`, where
        `prompt` is the raw text, `prompt_formatted` is the chat-templated
        text and `prompt_len` is the token count of the formatted text.

    Raises:
        AssertionError: If `input_len <= prefix_len`, or if `input_len` or
            `prefix_len` does not exceed the token count of the formatted
            base instruction.
    """
    assert (
        input_len > prefix_len
    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
    # Load the dataset.
    with open(dataset_path) as f:
        poem_lines = f.readlines()
    # Tokenize the poem lines.
    poem_token_ids = tokenizer(poem_lines).input_ids
    # NOTE(review): an empty file makes this division fail with
    # ZeroDivisionError.
    average_poem_len = sum(
        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
    # Base prefix for all requests.
    base_prompt = "Pick as many lines as you can from these poem lines:\n"
    base_message = [{
        "role": "user",
        "content": base_prompt,
    }]
    base_prompt_formatted = tokenizer.apply_chat_template(
        base_message, add_generation_prompt=True, tokenize=False)
    # Token cost of the instruction plus chat template, before any poem line.
    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
    assert (
        input_len > base_prompt_offset
    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
    # Total number of poem lines per prompt, estimated from the average
    # line length.
    num_input_lines = round(
        (input_len - base_prompt_offset) / average_poem_len)
    # First approximately `prefix_len` number of tokens in the
    # prompt are fixed poem lines.
    assert (
        prefix_len > base_prompt_offset
    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
    num_prefix_lines = round(
        (prefix_len - base_prompt_offset) / average_poem_len)
    prefix_lines = poem_lines[:num_prefix_lines]
    # Sample the rest of lines per request.
    sampled_requests: List[Tuple[str, str, int, int]] = []
    for _ in range(num_requests):
        # The sampled lines are drawn from the whole file, so they may
        # repeat lines already present in the shared prefix.
        sampled_lines = "".join(
            prefix_lines +
            random.sample(poem_lines, num_input_lines - num_prefix_lines))
        prompt = f"{base_prompt}{sampled_lines}"
        message = [
            {
                "role": "user",
                "content": prompt,
            },
        ]
        prompt_formatted = tokenizer.apply_chat_template(
            message, add_generation_prompt=True, tokenize=False)
        # The length is measured on the templated text, not the raw prompt.
        prompt_len = len(tokenizer(prompt_formatted).input_ids)
        sampled_requests.append(
            (prompt, prompt_formatted, prompt_len, output_len))
    return sampled_requests
@@ -122,37 +192,42 @@ def calculate_metrics(
    outputs: List[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
-) -> BenchmarkMetrics:
+) -> Tuple[BenchmarkMetrics, List[int]]:
-    total_output = 0
+    actual_output_lens = []
    total_input = 0
    completed = 0
-    per_token_latencies = []
+    tpots = []
    ttfts = []
    for i in range(len(outputs)):
        if outputs[i].success:
-            output_len = len(tokenizer.encode(outputs[i].generated_text))
+            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
-            total_output += output_len
+            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
-            per_token_latencies.append(outputs[i].latency / output_len)
+            if output_len > 1:
                tpots.append(
                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
            ttfts.append(outputs[i].ttft)
            completed += 1
        else:
            actual_output_lens.append(0)
    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
-        total_output=total_output,
+        total_output=sum(actual_output_lens),
        request_throughput=completed / dur_s,
        input_throughput=total_input / dur_s,
-        output_throughput=total_output / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
-        mean_ttft_ms=np.mean(ttfts) * 1000,
+        mean_ttft_ms=np.mean(ttfts or 0) *
-        median_ttft_ms=np.median(ttfts) * 1000,
+        1000,  # ttfts is empty if streaming is not supported by backend
-        p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
-        mean_tpot_ms=np.mean(per_token_latencies) * 1000,
+        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
-        median_tpot_ms=np.median(per_token_latencies) * 1000,
+        mean_tpot_ms=np.mean(tpots) * 1000,
-        p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
+        median_tpot_ms=np.median(tpots) * 1000,
        p99_tpot_ms=np.percentile(tpots, 99) * 1000,
    )
-    return metrics
+    return metrics, actual_output_lens
 async def benchmark(
@@ -171,10 +246,10 @@ async def benchmark(
    else:
        raise ValueError(f"Unknown backend: {backend}")
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
    print(f"Traffic request rate: {request_rate}")
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
    benchmark_start_time = time.perf_counter()
    tasks = []
    async for request in get_request(input_requests, request_rate):
@@ -192,40 +267,53 @@ async def benchmark(
            asyncio.create_task(
                request_func(request_func_input=request_func_input,
                             pbar=pbar)))
-    outputs = await asyncio.gather(*tasks)
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
    if not disable_tqdm:
        pbar.close()
    benchmark_duration = time.perf_counter() - benchmark_start_time
-    metrics = calculate_metrics(
+    metrics, actual_output_lens = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
        dur_s=benchmark_duration,
        tokenizer=tokenizer,
    )
-    print(f"Successful requests: {metrics.completed}")
+    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
-    print(f"Benchmark duration: {benchmark_duration:2f} s")
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
-    print(f"Total input tokens: {metrics.total_input}")
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
-    print(f"Total generated tokens: {metrics.total_output}")
+                                    benchmark_duration))
-    print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
+    print("{:<40} {:<10}".format("Total generated tokens:",
-    print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
+                                 metrics.total_output))
-    print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
+    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
-    print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
+                                    metrics.request_throughput))
-    print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
+    print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
-    print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
+                                    metrics.input_throughput))
-    print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
+    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
-    print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")
+                                    metrics.output_throughput))
    print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
    print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
                                    metrics.median_ttft_ms))
    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
    print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
                               n=50,
                               c='-'))
    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
    print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                    metrics.median_tpot_ms))
    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
    print("=" * 50)
    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
-        "request_inthroughput": metrics.request_throughput,
+        "request_throughput": metrics.request_throughput,
        "input_throughput": metrics.input_throughput,
        "output_throughput": metrics.output_throughput,
        "mean_ttft_ms": metrics.mean_ttft_ms,
@@ -233,7 +321,13 @@ async def benchmark(
        "p99_ttft_ms": metrics.p99_ttft_ms,
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
-        "p99_tpot_ms": metrics.p99_tpot_ms
+        "p99_tpot_ms": metrics.p99_tpot_ms,
        "input_lens": [output.prompt_len for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
        "itls": [output.itl for output in outputs],
        "generated_texts": [output.generated_text for output in outputs],
        "errors": [output.error for output in outputs],
    }
    return result
@@ -254,7 +348,60 @@ def main(args: argparse.Namespace):
    tokenizer = get_tokenizer(tokenizer_id,
                              trust_remote_code=args.trust_remote_code)
-    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
+
    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next "
            "release. Please use '--dataset-name' and "
            "'--dataset-path' in the future runs.",
            stacklevel=2)
        input_requests = sample_sharegpt_requests(
            dataset_path=args.dataset,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )
    elif args.dataset_name == "sharegpt":
        input_requests = sample_sharegpt_requests(
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )
    elif args.dataset_name == "sonnet":
        # Do not format the prompt, pass to message directly
        if args.backend == "openai-chat":
            input_requests = sample_sonnet_requests(
                dataset_path=args.dataset_path,
                num_requests=args.num_prompts,
                input_len=args.sonnet_input_len,
                output_len=args.sonnet_output_len,
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
            )
            input_requests = [(prompt, prompt_len, output_len)
                              for prompt, prompt_formatted, prompt_len,
                              output_len in input_requests]
        else:
            assert (
                tokenizer.chat_template or tokenizer.default_chat_template
            ), "Tokenizer/model must have chat template for sonnet dataset."
            input_requests = sample_sonnet_requests(
                dataset_path=args.dataset_path,
                num_requests=args.num_prompts,
                input_len=args.sonnet_input_len,
                output_len=args.sonnet_output_len,
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
            )
            input_requests = [(prompt_formatted, prompt_len, output_len)
                              for prompt, prompt_formatted, prompt_len,
                              output_len in input_requests]
    else:
        raise ValueError(f"Unknown dataset: {args.dataset_name}")
    benchmark_result = asyncio.run(
        benchmark(
@@ -277,13 +424,23 @@ def main(args: argparse.Namespace):
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
        result_json["version"] = args.version
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
        result_json["use_beam_search"] = args.use_beam_search
        result_json["num_prompts"] = args.num_prompts
        # Metadata
        if args.metadata:
            for item in args.metadata:
                if "=" in item:
                    kvstring = item.split("=")
                    result_json[kvstring[0].strip()] = kvstring[1].strip()
                else:
                    raise ValueError(
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )
        # Traffic
        result_json["request_rate"] = (
            args.request_rate if args.request_rate < float("inf") else "inf")
@@ -293,7 +450,9 @@ def main(args: argparse.Namespace):
        # Save to file
        base_model_id = model_id.split("/")[-1]
-        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w") as outfile:
            json.dump(result_json, outfile)
@@ -307,12 +466,6 @@ if __name__ == "__main__":
        default="vllm",
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
    )
    parser.add_argument(
        "--version",
        type=str,
        default="N/A",
        help="Version of the serving backend/engine.",
    )
    parser.add_argument(
        "--base-url",
        type=str,
@@ -324,12 +477,26 @@ if __name__ == "__main__":
    parser.add_argument(
        "--endpoint",
        type=str,
-        default="/generate",
+        default="/v1/completions",
        help="API endpoint.",
    )
-    parser.add_argument("--dataset",
+    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to the ShareGPT dataset, will be deprecated in the "
        "next release.",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        default="sharegpt",
        choices=["sharegpt", "sonnet"],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument("--dataset-path",
                        type=str,
-                        required=True,
+                        default=None,
                        help="Path to the dataset.")
    parser.add_argument(
        "--model",
@@ -341,7 +508,7 @@ if __name__ == "__main__":
        "--tokenizer",
        type=str,
        help=
-        "Name or path of the tokenizer, if not using the default model tokenizer.",
+        "Name or path of the tokenizer, if not using the default tokenizer.",
    )
    parser.add_argument(
        "--best-of",
@@ -357,6 +524,33 @@ if __name__ == "__main__":
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--sharegpt-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
        "from the ShareGPT dataset.")
    parser.add_argument(
        "--sonnet-input-len",
        type=int,
        default=550,
        help=
        "Number of input tokens per request, used only for sonnet dataset.",
    )
    parser.add_argument(
        "--sonnet-output-len",
        type=int,
        default=150,
        help=
        "Number of output tokens per request, used only for sonnet dataset.",
    )
    parser.add_argument(
        "--sonnet-prefix-len",
        type=int,
        default=200,
        help=
        "Number of prefix tokens per request, used only for sonnet dataset.",
    )
    parser.add_argument(
        "--request-rate",
        type=float,
@@ -382,6 +576,21 @@ if __name__ == "__main__":
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
        nargs="*",
        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
        "for metadata of this run to be saved in the result JSON file "
        "for record keeping purposes.",
    )
    parser.add_argument(
        "--result-dir",
        type=str,
        default=None,
        help="Specify directory to save benchmark json results."
        "If not specified, results are saved in the current directory.",
    )
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -6,9 +6,11 @@ import time
 from typing import List, Optional, Tuple
 import torch
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
-from tqdm import tqdm
+
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 def sample_requests(
@@ -29,22 +31,23 @@ def sample_requests(
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]
-    # Tokenize the prompts and completions.
+    # Shuffle the dataset.
-    prompts = [prompt for prompt, _ in dataset]
+    random.shuffle(dataset)
    prompt_token_ids = tokenizer(prompts).input_ids
    completions = [completion for _, completion in dataset]
    completion_token_ids = tokenizer(completions).input_ids
    tokenized_dataset = []
    for i in range(len(dataset)):
        output_len = len(completion_token_ids[i])
        if fixed_output_len is not None:
            output_len = fixed_output_len
        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
-    # Filter out too long sequences.
+    # Filter out sequences that are too long or too short
    filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    for i in range(len(dataset)):
        if len(filtered_dataset) == num_requests:
            break
        # Tokenize the prompts and completions.
        prompt = dataset[i][0]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
@@ -53,9 +56,7 @@ def sample_requests(
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))
-    # Sample the requests.
+    return filtered_dataset
    sampled_requests = random.sample(filtered_dataset, num_requests)
    return sampled_requests
 def run_vllm(
@@ -72,7 +73,13 @@ def run_vllm(
    max_model_len: Optional[int],
    enforce_eager: bool,
    kv_cache_dtype: str,
    quantization_param_path: Optional[str],
    device: str,
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
    gpu_memory_utilization: float = 0.9,
    download_dir: Optional[str] = None,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
@@ -84,31 +91,34 @@ def run_vllm(
        trust_remote_code=trust_remote_code,
        dtype=dtype,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        enforce_eager=enforce_eager,
        kv_cache_dtype=kv_cache_dtype,
        quantization_param_path=quantization_param_path,
        device=device,
        enable_prefix_caching=enable_prefix_caching,
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
    )
    # Add the requests to the engine.
    prompts = []
    sampling_params = []
    for prompt, _, output_len in requests:
-        sampling_params = SamplingParams(
+        prompts.append(prompt)
-            n=n,
+        sampling_params.append(
-            temperature=0.0 if use_beam_search else 1.0,
+            SamplingParams(
-            top_p=1.0,
+                n=n,
-            use_beam_search=use_beam_search,
+                temperature=0.0 if use_beam_search else 1.0,
-            ignore_eos=True,
+                top_p=1.0,
-            max_tokens=output_len,
+                use_beam_search=use_beam_search,
-        )
+                ignore_eos=True,
-        # FIXME(woosuk): Do not use internal method.
+                max_tokens=output_len,
-        llm._add_request(
+            ))
            prompt=prompt,
            prompt_token_ids=None,
            sampling_params=sampling_params,
        )
    start = time.perf_counter()
-    # FIXME(woosuk): Do not use internal method.
+    llm.generate(prompts, sampling_params, use_tqdm=True)
    llm._run_engine(use_tqdm=True)
    end = time.perf_counter()
    return end - start
@@ -179,13 +189,15 @@ def run_mii(
    tensor_parallel_size: int,
    output_len: int,
 ) -> float:
-    from mii import pipeline
+    from mii import client, serve
-    llm = pipeline(model, tensor_parallel=tensor_parallel_size)
+    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [prompt for prompt, _, _ in requests]
    start = time.perf_counter()
-    llm(prompts, max_new_tokens=output_len)
+    llm.generate(prompts, max_new_tokens=output_len)
    end = time.perf_counter()
    client = client(model)
    client.terminate_server()
    return end - start
@@ -206,12 +218,15 @@ def main(args: argparse.Namespace):
                                   args.output_len)
    if args.backend == "vllm":
-        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+        elapsed_time = run_vllm(
-                                args.quantization, args.tensor_parallel_size,
+            requests, args.model, args.tokenizer, args.quantization,
-                                args.seed, args.n, args.use_beam_search,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype,
+            args.trust_remote_code, args.dtype, args.max_model_len,
-                                args.max_model_len, args.enforce_eager,
+            args.enforce_eager, args.kv_cache_dtype,
-                                args.kv_cache_dtype, args.device)
+            args.quantization_param_path, args.device,
            args.enable_prefix_caching, args.enable_chunked_prefill,
            args.max_num_batched_tokens, args.gpu_memory_utilization,
            args.download_dir)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -251,7 +266,7 @@ if __name__ == "__main__":
    parser.add_argument("--tokenizer", type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                        default=None)
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n",
@@ -286,22 +301,58 @@ if __name__ == "__main__":
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    parser.add_argument('--gpu-memory-utilization',
                        type=float,
                        default=0.9,
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1.'
                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument("--enforce-eager",
                        action="store_true",
                        help="enforce eager execution")
    parser.add_argument(
        "--kv-cache-dtype",
        type=str,
-        choices=["auto", "fp8_e5m2"],
+        choices=["auto", "fp8"],
        default="auto",
        help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        'Data type for kv cache storage. If "auto", will use model data type. '
        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
        'common inference criteria.')
    parser.add_argument(
        '--quantization-param-path',
        type=str,
        default=None,
        help='Path to the JSON file containing the KV cache scaling factors. '
        'This should generally be supplied, when KV cache dtype is FP8. '
        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
-        choices=["cuda"],
+        choices=["cuda", "cpu"],
-        help='device type for vLLM execution, supporting CUDA only currently.')
+        help='device type for vLLM execution, supporting CUDA and CPU.')
    parser.add_argument(
        "--enable-prefix-caching",
        action='store_true',
        help="enable automatic prefix caching for vLLM backend.")
    parser.add_argument("--enable-chunked-prefill",
                        action='store_true',
                        help="enable chunked prefill for vLLM backend.")
    parser.add_argument('--max-num-batched-tokens',
                        type=int,
                        default=None,
                        help='maximum number of batched tokens per '
                        'iteration')
    parser.add_argument('--download-dir',
                        type=str,
                        default=None,
                        help='directory to download and load the weights, '
                        'default to the default cache dir of huggingface')
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -0,0 +1,302 @@
 import argparse
 import os
 import sys
 from typing import Optional
 import torch
 import torch.nn.functional as F
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.aqlm import (
    dequantize_weight, generic_dequantize_gemm, get_int_dtype,
    optimized_dequantize_gemm)
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 def torch_mult(
        input: torch.Tensor,  #  [..., in_features]
        weights: torch.Tensor,
        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
 ) -> torch.Tensor:
    """Dense baseline: apply ``F.linear(input, weights)`` without a bias.

    ``scales`` is accepted so the signature lines up with the other
    benchmarked methods, but it is not used in the computation.
    """
    return F.linear(input, weights)
 def dequant_out_scale(
    input: torch.Tensor,  #  [..., in_features]
    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
    codebooks: torch.
    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
 ) -> torch.Tensor:
    """Dequantize with ``ops.aqlm_dequant`` and apply the scales.

    Without a bias the scales are multiplied into the output of the linear
    layer; with a bias they are multiplied into the dequantized weights
    before the linear layer runs.
    """
    dequantized = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)

    if bias is not None:
        # Bias present: scale the weights (in place) ahead of the matmul so
        # that the bias term itself is not multiplied by the scales.
        flat_scale_shape = scales.shape[:-3] + (-1, )
        row_scales = scales.view(flat_scale_shape).expand(
            -1, dequantized.shape[1])
        dequantized *= row_scales
        return F.linear(input, dequantized, bias)

    # No bias: run the matmul first, then scale each output column in place.
    result = F.linear(input, dequantized, bias)
    result_shape = result.shape
    result_2d = result.view(-1, result.size(-1))
    column_scales = scales.view(-1, scales.shape[0]).expand(
        result_2d.shape[0], -1)
    result_2d *= column_scales
    return result_2d.view(result_shape)
 def dequant_weight_scale(
    input: torch.Tensor,  #  [..., in_features]
    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
    codebooks: torch.
    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
 ) -> torch.Tensor:
    """Dequantize, scale the weights in place, then apply ``F.linear``.

    The scales are always folded into the dequantized weights, regardless of
    whether a bias is supplied.
    """
    dequantized = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
    flat_scale_shape = scales.shape[:-3] + (-1, )
    row_scales = scales.view(flat_scale_shape).expand(-1,
                                                      dequantized.shape[1])
    dequantized *= row_scales
    return F.linear(input, dequantized, bias)
 def dequant_no_scale(
    input: torch.Tensor,  #  [..., in_features]
    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
    codebooks: torch.
    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
 ) -> torch.Tensor:
    """Dequantize and apply ``F.linear`` while ignoring ``scales`` entirely.

    ``scales`` is part of the signature only so this function can be called
    like the other benchmarked methods.
    """
    return F.linear(
        input,
        ops.aqlm_dequant(codes, codebooks, output_partition_sizes),
        bias,
    )
 # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
 # the generic pytorch version.
 # Just visual comparison.
 def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
    """Print weights produced by the two dequantization paths for inspection.

    Random codes and codebooks are created on ``cuda:0``, a recognisable
    pattern is written into the first codebook entries and the first row of
    codes, and the result of ``dequantize_weight`` is printed next to the
    result of ``ops.aqlm_dequant``. Nothing is asserted; the comparison is
    visual only.

    Args:
        k: Input feature count; ``k // 8`` code groups are generated per row.
        parts: Tensor of output partition sizes; their sum is the row count.
        nbooks: Number of codebooks.
        bits: Bits per code element (sets the code range and codebook size).
    """
    n = parts.sum().item()
    device = torch.device('cuda:0')
    code_range = (1 << bits) // 2
    ingroups = 8
    codes = torch.randint(-code_range,
                          code_range,
                          size=(n, k // ingroups, nbooks),
                          dtype=get_int_dtype(bits),
                          device=device)
    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
                            dtype=torch.float16,
                            device=device)
    # Overwrite the first 16 codebook entries with increasing values (scaled
    # by a power of ten per codebook) so they are easy to spot in the output.
    count = 0
    for index in range(16):
        for i in range(8):
            for book in range(nbooks):
                codebooks[book, index, 0, i] = count * (10**book)
            count += 1
    print("codes shape", codes.shape)
    # Point the first and last 16 code groups of row 0 at those known entries.
    for i in range(16):
        for book in range(nbooks):
            codes[0, i, book] = i
            codes[0, -i, book] = i
    weights = dequantize_weight(codes, codebooks, None)
    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
    print("weights shape:", weights.shape)
    print("weights2 shape:", weights2.shape)
    print("weights are:", weights)
    print("weights2 are:", weights2)
    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
    print("last 128 weights are", weights[0, -128:])
    print("last 128 weights2 are:", weights2[0, -128:])
 def _parse_bool_flag(value: str) -> bool:
    """Interpret a command-line string as a boolean for ``--test``.

    ``type=bool`` treats every non-empty string (including ``"False"``) as
    True, so the text is mapped explicitly instead.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised boolean.
    """
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value, got {value!r}")


 def main():
    """Parse the command line, then run the dequant tester or the benchmark.

    With ``--test`` set, ``dequant_test`` is run once and the function
    returns. Otherwise every method is timed over a grid of shapes and the
    table is written to ``./aqlm_benchmark_{nbooks}x{bits}.csv``; the current
    ``m`` is echoed to the real stdout as progress.
    """
    parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
    # Add arguments
    parser.add_argument("--nbooks",
                        type=int,
                        default=1,
                        help="Number of codebooks (default: 1)")
    parser.add_argument("--bits",
                        type=int,
                        default=16,
                        help="Number of bits per code element (default: 16)")
    # nargs='?' keeps ``--test True`` working and also allows a bare ``--test``.
    parser.add_argument(
        "--test",
        type=_parse_bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="Run the decompression/dequant tester rather than benchmarking "
        "(default: False)")
    # Parse the arguments
    args = parser.parse_args()
    # Extract values
    nbooks = args.nbooks
    bits = args.bits
    if args.test:
        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
        return
    # Otherwise, benchmark.
    methods = [
        ops.aqlm_gemm,
        dequant_out_scale,
        generic_dequantize_gemm,
        optimized_dequantize_gemm,
        dequant_weight_scale,
        torch_mult,
        dequant_no_scale,
    ]
    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
    print(f"writing benchmarks to file {filename}")
    # These are reasonable prefill sizes.
    ks_and_partitions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
                         (4096, (11008, 11008)), (11008, (4096, )))
    # reasonable ranges for m.
    m_values = [
        1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112, 128, 256,
        512, 1024, 1536, 2048, 3072, 4096
    ]
    with open(filename, "w") as f:
        # run_grid prints to sys.stdout, so point it at the file; the
        # try/finally guarantees stdout is restored even if a run fails.
        previous_stdout = sys.stdout
        sys.stdout = f
        try:
            print('m | k | n | n parts', end='')
            for method in methods:
                print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
            print('')
            for m in m_values:
                # Progress indicator on the real stdout, not in the file.
                print(f'{m}', file=sys.__stdout__)
                for k, partition_sizes in ks_and_partitions:
                    run_grid(m, k, torch.tensor(partition_sizes), nbooks,
                             bits, methods)
        finally:
            sys.stdout = previous_stdout
 def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
              methods):
    """Time every method for one problem shape and print one table row.

    Each method is first run through ``run_timing`` as a warm-up, then timed
    ``num_trials`` times; the fastest trial (in microseconds) is printed.
    The row is written to ``sys.stdout`` as ``m | k | n | parts | t1 | ...``.

    Args:
        m: Number of input rows passed to ``run_timing``.
        k: Input feature count passed to ``run_timing``.
        parts: Tensor of output partition sizes; their sum is printed as n.
        nbooks: Number of codebooks.
        bits: Bits per code element.
        methods: Iterable of callables to benchmark.
    """
    # I didn't see visible improvements from increasing these, but feel free :)
    num_warmup_trials = 1
    num_trials = 1
    num_calls = 100
    timing_kwargs = dict(
        num_calls=num_calls,
        m=m,
        k=k,
        parts=parts,
        nbooks=nbooks,
        bits=bits,
    )
    # warmup.
    for method in methods:
        for _ in range(num_warmup_trials):
            run_timing(method=method, **timing_kwargs)
    n = parts.sum().item()
    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
    for method in methods:
        best_time_us = 1e20
        for _ in range(num_trials):
            kernel_dur_ms = run_timing(method=method, **timing_kwargs)
            kernel_dur_us = 1000 * kernel_dur_ms
            best_time_us = min(best_time_us, kernel_dur_us)
        # Report the best trial, not whichever trial happened to run last.
        print(f' | {best_time_us:.0f}', end='')
    print('')
 def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
                nbooks: int, bits: int, method) -> float:
    """Time ``num_calls`` invocations of ``method`` with CUDA events.

    Random inputs, codes, codebooks, scales and a dense weight matrix are
    created on ``cuda:0``. ``torch_mult`` is called with the dense weights;
    every other method is called as
    ``method(input, codes, codebooks, scales, parts, None)``.

    Args:
        num_calls: Number of back-to-back calls inside the timed region.
        m: Number of input rows.
        k: Input feature count; ``k // 8`` code groups are generated per row.
        parts: Tensor of output partition sizes; their sum is the row count
            of the generated codes and dense weights.
        nbooks: Number of codebooks.
        bits: Bits per code element (sets the code range and codebook size).
        method: Callable to benchmark.

    Returns:
        Elapsed time between the two CUDA events divided by ``num_calls``.
    """
    n = parts.sum().item()
    device = torch.device('cuda:0')
    activations = torch.randn((1, m, k), dtype=torch.float16, device=device)
    code_range = (1 << bits) // 2
    ingroups = 8
    codes = torch.randint(-code_range,
                          code_range,
                          size=(n, k // ingroups, nbooks),
                          dtype=get_int_dtype(bits),
                          device=device)
    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
                            dtype=torch.float16,
                            device=device)
    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
    # for comparison to just a pytorch mult.
    weights = torch.randn((n, k), dtype=torch.float16, device=device)
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    if method is torch_mult:
        for _ in range(num_calls):
            torch_mult(activations, weights, scales)
    else:
        for _ in range(num_calls):
            method(activations, codes, codebooks, scales, parts, None)
    end_event.record()
    # Wait for the end event before reading the elapsed time.
    end_event.synchronize()
    dur_ms = start_event.elapsed_time(end_event) / num_calls
    return dur_ms
 if __name__ == "__main__":
    sys.exit(main())
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@@ -1,25 +1,29 @@
 import argparse
 import json
 import os
 import sys
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 from vllm.model_executor.layers.fused_moe import fused_moe
 import torch
 import torch.nn.functional as F
 import triton
 from tqdm import tqdm
 from vllm.model_executor.layers.fused_moe import (fused_moe,
                                                  get_config_file_name)
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-def main():
+def main(dtype: str):
    method = fused_moe
    for bs in [
            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
            2048, 3072, 4096
    ]:
-        run_grid(bs, method=method)
+        run_grid(bs, method=method, dtype=dtype)
-def run_grid(bs, method):
+def run_grid(bs, method, dtype: str):
    d_model = 4096
    num_total_experts = 8
    top_k = 2
@@ -32,39 +36,29 @@ def run_grid(bs, method):
    num_trials = 1
    configs = []
    if bs <= 16:
        BLOCK_SIZES_M = [16]
    elif bs <= 32:
        BLOCK_SIZES_M = [16, 32]
    elif bs <= 64:
        BLOCK_SIZES_M = [16, 32, 64]
    elif bs <= 128:
        BLOCK_SIZES_M = [16, 32, 64, 128]
    else:
        BLOCK_SIZES_M = [16, 32, 64, 128, 256]
    for block_size_n in [32, 64, 128, 256]:
-        for block_size_m in BLOCK_SIZES_M:
+        for block_size_m in [16, 32, 64, 128, 256]:
            for block_size_k in [64, 128, 256]:
                for group_size_m in [1, 16, 32, 64]:
                    for num_warps in [4, 8]:
-                        configs.append({
+                        for num_stages in [2, 3, 4, 5]:
-                            "BLOCK_SIZE_M": block_size_m,
+                            configs.append({
-                            "BLOCK_SIZE_N": block_size_n,
+                                "BLOCK_SIZE_M": block_size_m,
-                            "BLOCK_SIZE_K": block_size_k,
+                                "BLOCK_SIZE_N": block_size_n,
-                            "GROUP_SIZE_M": group_size_m,
+                                "BLOCK_SIZE_K": block_size_k,
-                            "num_warps": num_warps,
+                                "GROUP_SIZE_M": group_size_m,
-                            "num_stages": 4,
+                                "num_warps": num_warps,
-                        })
+                                "num_stages": num_stages,
                            })
    best_config = None
    best_time_us = 1e20
-    for config in configs:
+    print(f'{tp_size=} {bs=}')
-        print(f'{tp_size=} {bs=}')
+
-        print(f'{config}')
+    for config in tqdm(configs):
        # warmup
        print(f'warming up')
        try:
            for _ in range(num_warmup_trials):
                run_timing(
@@ -77,12 +71,12 @@ def run_grid(bs, method):
                    model_intermediate_size=model_intermediate_size,
                    method=method,
                    config=config,
                    dtype=dtype,
                )
        except triton.runtime.autotuner.OutOfResources:
            continue
        # trial
        print(f'benchmarking')
        for _ in range(num_trials):
            kernel_dur_ms = run_timing(
                num_calls=num_calls,
@@ -94,6 +88,7 @@ def run_grid(bs, method):
                model_intermediate_size=model_intermediate_size,
                method=method,
                config=config,
                dtype=dtype,
            )
            kernel_dur_us = 1000 * kernel_dur_ms
@@ -103,42 +98,73 @@ def run_grid(bs, method):
                best_config = config
                best_time_us = kernel_dur_us
-            print(
+                tqdm.write(
-                f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}'
+                    f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
-            )
+                    f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
                    f'{d_model=} {model_intermediate_size=} {num_layers=}')
    print("best_time_us", best_time_us)
    print("best_config", best_config)
-    filename = "/tmp/config.jsonl"
+    # holds Dict[str, Dict[str, int]]
    filename = get_config_file_name(num_total_experts,
                                    model_intermediate_size // tp_size,
                                    "float8" if dtype == "float8" else None)
    print(f"writing config to file {filename}")
-    with open(filename, "a") as f:
+    existing_content = {}
-        f.write(json.dumps({str(bs): best_config}) + "\n")
+    if os.path.exists(filename):
        with open(filename, "r") as f:
            existing_content = json.load(f)
    existing_content[str(bs)] = best_config
    with open(filename, "w") as f:
        json.dump(existing_content, f, indent=4)
        f.write("\n")
 def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
               top_k: int, tp_size: int, model_intermediate_size: int, method,
-               config) -> float:
+               config, dtype: str) -> float:
    shard_intermediate_size = model_intermediate_size // tp_size
    hidden_states = torch.rand(
        (bs, d_model),
        device="cuda:0",
-        dtype=torch.bfloat16,
+        dtype=torch.float16,
    )
-    ws = torch.rand(
+    w1 = torch.rand(
        (num_total_experts, 2 * shard_intermediate_size, d_model),
        device=hidden_states.device,
        dtype=hidden_states.dtype,
    )
-    w2s = torch.rand(
+    w2 = torch.rand(
        (num_total_experts, d_model, shard_intermediate_size),
        device=hidden_states.device,
        dtype=hidden_states.dtype,
    )
    w1_scale = None
    w2_scale = None
    a1_scale = None
    a2_scale = None
    if dtype == "float8":
        w1 = w1.to(torch.float8_e4m3fn)
        w2 = w2.to(torch.float8_e4m3fn)
        w1_scale = torch.ones(num_total_experts,
                              device=hidden_states.device,
                              dtype=torch.float32)
        w2_scale = torch.ones(num_total_experts,
                              device=hidden_states.device,
                              dtype=torch.float32)
        a1_scale = torch.ones(1,
                              device=hidden_states.device,
                              dtype=torch.float32)
        a2_scale = torch.ones(1,
                              device=hidden_states.device,
                              dtype=torch.float32)
    gating_output = F.softmax(torch.rand(
        (num_calls, bs, num_total_experts),
        device=hidden_states.device,
@@ -153,13 +179,18 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
    for i in range(num_calls):
        hidden_states = method(
            hidden_states=hidden_states,
-            w1=ws,
+            w1=w1,
-            w2=w2s,
+            w2=w2,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            gating_output=gating_output[i],
            topk=2,
            renormalize=True,
            inplace=True,
            override_config=config,
            use_fp8=dtype == "float8",
        )
    end_event.record()
    end_event.synchronize()
@@ -169,4 +200,16 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
 if __name__ == "__main__":
-    sys.exit(main())
+    parser = argparse.ArgumentParser(
        prog='benchmark_mixtral_moe',
        description='Benchmark and tune the fused_moe kernel',
    )
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['float8', 'float16'],
        help='Data type used for fused_moe kernel computations',
    )
    args = parser.parse_args()
    sys.exit(main(args.dtype))
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -1,12 +1,12 @@
 from typing import Optional
 import argparse
 import random
 import time
 from typing import Optional
 import torch
 from vllm import _custom_ops as ops
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
 from vllm._C import ops
 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@@ -16,7 +16,7 @@ PARTITION_SIZE = 512
 def main(
    version: str,
    num_seqs: int,
-    context_len: int,
+    seq_len: int,
    num_query_heads: int,
    num_kv_heads: int,
    head_size: int,
@@ -48,12 +48,12 @@ def main(
                                   dtype=torch.float,
                                   device=device)
-    context_lens = [context_len for _ in range(num_seqs)]
+    seq_lens = [seq_len for _ in range(num_seqs)]
-    max_context_len = max(context_lens)
+    max_seq_len = max(seq_lens)
-    context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
    # Create the block tables.
-    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
    block_tables = []
    for _ in range(num_seqs):
        block_table = [
@@ -77,8 +77,7 @@ def main(
    # Prepare for the paged attention kernel.
    output = torch.empty_like(query)
    if version == "v2":
-        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
                          PARTITION_SIZE)
        tmp_output = torch.empty(
            size=(num_seqs, num_query_heads, num_partitions, head_size),
            dtype=output.dtype,
@@ -97,6 +96,9 @@ def main(
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
        # Using default kv_scale
        kv_scale = 1.0
        for _ in range(num_iters):
            if version == "v1":
                ops.paged_attention_v1(
@@ -107,11 +109,12 @@ def main(
                    num_kv_heads,
                    scale,
                    block_tables,
-                    context_lens,
+                    seq_lens,
                    block_size,
-                    max_context_len,
+                    max_seq_len,
                    alibi_slopes,
                    kv_cache_dtype,
                    kv_scale,
                )
            elif version == "v2":
                ops.paged_attention_v2(
@@ -125,11 +128,12 @@ def main(
                    num_kv_heads,
                    scale,
                    block_tables,
-                    context_lens,
+                    seq_lens,
                    block_size,
-                    max_context_len,
+                    max_seq_len,
                    alibi_slopes,
                    kv_cache_dtype,
                    kv_scale,
                )
            else:
                raise ValueError(f"Invalid version: {version}")
@@ -161,7 +165,7 @@ if __name__ == '__main__':
                        choices=["v1", "v2"],
                        default="v2")
    parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument("--context-len", type=int, default=4096)
+    parser.add_argument("--seq_len", type=int, default=4096)
    parser.add_argument("--num-query-heads", type=int, default=64)
    parser.add_argument("--num-kv-heads", type=int, default=8)
    parser.add_argument("--head-size",
@@ -179,11 +183,13 @@ if __name__ == '__main__':
    parser.add_argument(
        "--kv-cache-dtype",
        type=str,
-        choices=["auto", "fp8_e5m2"],
+        choices=["auto", "fp8"],
        default="auto",
        help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        'Data type for kv cache storage. If "auto", will use model data type. '
-    parser.add_argument("--device", type=str, choices=["cuda"], default="cuda")
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
        'common inference criteria.')
    args = parser.parse_args()
    print(args)
@@ -192,7 +198,7 @@ if __name__ == '__main__':
    main(
        version=args.version,
        num_seqs=args.batch_size,
-        context_len=args.context_len,
+        seq_len=args.seq_len,
        num_query_heads=args.num_query_heads,
        num_kv_heads=args.num_kv_heads,
        head_size=args.head_size,
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -0,0 +1,121 @@
 import argparse
 from itertools import accumulate
 from typing import Optional
 import nvtx
 import torch
 from vllm.model_executor.layers.rotary_embedding import get_rope
 def benchmark_rope_kernels_multi_lora(
    is_neox_style: bool,
    batch_size: int,
    seq_len: int,
    num_heads: int,
    head_size: int,
    rotary_dim: Optional[int],
    dtype: torch.dtype,
    seed: int,
    device: str,
    max_position: int = 8192,
    base: int = 10000,
 ) -> None:
    """Run batched and non-batched linear-scaling RoPE under NVTX ranges.

    Four scaling factors (1, 2, 4, 8) stand in for four LoRAs. The same random
    queries/keys are pushed once through one rotary embedding per scaling
    factor (NVTX range "non-batched") and once through a single rotary
    embedding built with all factors (NVTX range "batched"). Nothing is timed
    or returned here; the ranges are meant to be inspected with an
    NVTX-aware profiler.

    Args:
        is_neox_style: Forwarded to ``get_rope``.
        batch_size: Number of sequences in the random batch.
        seq_len: Number of positions per sequence.
        num_heads: Number of heads; the hidden size is
            ``num_heads * head_size``.
        head_size: Per-head dimension.
        rotary_dim: Rotary dimension; ``None`` means ``head_size``.
        dtype: dtype of the random query/key tensors.
        seed: Seed for the torch (and, if available, CUDA) RNG.
        device: Device set as the torch default for all tensors created here.
        max_position: Upper bound (exclusive) of the random positions.
        base: RoPE base forwarded to ``get_rope``.
    """
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)

    rotary_dim = head_size if rotary_dim is None else rotary_dim

    # simulating serving 4 LoRAs
    scaling_factors = [1, 2, 4, 8]
    num_factors = len(scaling_factors)

    # A single batched RoPE instance handles every scaling factor at once.
    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
                            is_neox_style, {
                                "type": "linear",
                                "factor": tuple(scaling_factors)
                            })

    # The non-batched RoPE accepts one scaling factor only, so build one
    # instance per factor to reproduce the same workload.
    non_batched_ropes = [
        get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
            "type": "linear",
            "factor": (factor, )
        }) for factor in scaling_factors
    ]

    positions = torch.randint(0, max_position, (batch_size, seq_len))
    query = torch.randn(batch_size,
                        seq_len,
                        num_heads * head_size,
                        dtype=dtype)
    key = torch.randn_like(query)

    # Offsets for the batched RoPE: the per-factor caches are concatenated,
    # so each query needs the start offset of the cache for its own type.
    # Each factor but the last contributes max_position * factor * 2 entries
    # (layout assumed from the original note -- TODO confirm vs. get_rope).
    segment_sizes = [
        max_position * factor * 2 for factor in scaling_factors[:-1]
    ]
    offset_map = torch.tensor(list(accumulate([0, *segment_sizes])))

    query_types = torch.randint(0,
                                num_factors, (batch_size, seq_len),
                                device=device)
    # Translate each query's type into its cache offset; the kernel takes the
    # offsets flattened.
    flatten_offsets = offset_map[query_types].flatten()

    # Group queries/keys by type so each non-batched RoPE gets its own slice.
    type_masks = [query_types == type_id for type_id in range(num_factors)]
    queries = [query[mask] for mask in type_masks]
    keys = [key[mask] for mask in type_masks]

    # Make sure all setup work has finished before the profiled regions.
    torch.cuda.synchronize()
    with nvtx.annotate("non-batched", color="yellow"):
        for q, k, rope in zip(queries, keys, non_batched_ropes):
            rope.forward(positions, q, k)
    torch.cuda.synchronize()
    with nvtx.annotate("batched", color="green"):
        batched_rope.forward(positions, query, key, flatten_offsets)
    torch.cuda.synchronize()
 # Command-line entry point: parse the benchmark options and run the multi-LoRA
 # RoPE benchmark once with them.
 if __name__ == '__main__':

    def _parse_bool(value: str) -> bool:
        """Parse a command-line boolean such as ``true``/``false``.

        ``type=bool`` must not be used with argparse: ``bool("False")`` is
        ``True`` because any non-empty string is truthy, so the flag could
        never be switched off from the command line.
        """
        normalized = value.strip().lower()
        if normalized in ("true", "t", "yes", "y", "1"):
            return True
        if normalized in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value (true/false), got {value!r}")

    parser = argparse.ArgumentParser(
        description="Benchmark the rotary embedding kernels.")
    parser.add_argument("--is-neox-style", type=_parse_bool, default=True)
    parser.add_argument("--batch-size", type=int, default=16)
    parser.add_argument("--seq-len", type=int, default=512)
    parser.add_argument("--num-heads", type=int, default=8)
    parser.add_argument("--head-size",
                        type=int,
                        choices=[64, 80, 96, 112, 128, 256],
                        default=128)
    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
    parser.add_argument("--dtype",
                        type=str,
                        choices=["bfloat16", "float"],
                        default="float")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--device",
                        type=str,
                        choices=["cuda:0", "cuda:1"],
                        default="cuda:0")
    args = parser.parse_args()
    print(args)

    benchmark_rope_kernels_multi_lora(
        is_neox_style=args.is_neox_style,
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        num_heads=args.num_heads,
        head_size=args.head_size,
        rotary_dim=args.rotary_dim,
        # Map the dtype name ("bfloat16" / "float") to the torch dtype object.
        dtype=getattr(torch, args.dtype),
        seed=args.seed,
        device=args.device,
    )
--- a/benchmarks/sonnet.txt
+++ b/benchmarks/sonnet.txt
@@ -0,0 +1,518 @@
 FROM fairest creatures we desire increase,
 That thereby beauty's rose might never die,
 But as the riper should by time decease,
 His tender heir might bear his memory:
 But thou, contracted to thine own bright eyes,
 Feed'st thy light'st flame with self-substantial fuel,
 Making a famine where abundance lies,
 Thyself thy foe, to thy sweet self too cruel.
 Thou that art now the world's fresh ornament
 And only herald to the gaudy spring,
 Within thine own bud buriest thy content
 And, tender churl, makest waste in niggarding.
 Pity the world, or else this glutton be,
 To eat the world's due, by the grave and thee.
 When forty winters shall besiege thy brow,
 And dig deep trenches in thy beauty's field,
 Thy youth's proud livery, so gazed on now,
 Will be a tatter'd weed, of small worth held:
 Then being ask'd where all thy beauty lies,
 Where all the treasure of thy lusty days,
 To say, within thine own deep-sunken eyes,
 Were an all-eating shame and thriftless praise.
 How much more praise deserved thy beauty's use,
 If thou couldst answer 'This fair child of mine
 Shall sum my count and make my old excuse,'
 Proving his beauty by succession thine!
 This were to be new made when thou art old,
 And see thy blood warm when thou feel'st it cold.
 Look in thy glass, and tell the face thou viewest
 Now is the time that face should form another;
 Whose fresh repair if now thou not renewest,
 Thou dost beguile the world, unbless some mother.
 For where is she so fair whose unear'd womb
 Disdains the tillage of thy husbandry?
 Or who is he so fond will be the tomb
 Of his self-love, to stop posterity?
 Thou art thy mother's glass, and she in thee
 Calls back the lovely April of her prime:
 So thou through windows of thine age shall see
 Despite of wrinkles this thy golden time.
 But if thou live, remember'd not to be,
 Die single, and thine image dies with thee.
 Unthrifty loveliness, why dost thou spend
 Upon thyself thy beauty's legacy?
 Nature's bequest gives nothing but doth lend,
 And being frank she lends to those are free.
 Then, beauteous niggard, why dost thou abuse
 The bounteous largess given thee to give?
 Profitless usurer, why dost thou use
 So great a sum of sums, yet canst not live?
 For having traffic with thyself alone,
 Thou of thyself thy sweet self dost deceive.
 Then how, when nature calls thee to be gone,
 What acceptable audit canst thou leave?
 Thy unused beauty must be tomb'd with thee,
 Which, used, lives th' executor to be.
 Those hours, that with gentle work did frame
 The lovely gaze where every eye doth dwell,
 Will play the tyrants to the very same
 And that unfair which fairly doth excel:
 For never-resting time leads summer on
 To hideous winter and confounds him there;
 Sap cheque'd with frost and lusty leaves quite gone,
 Beauty o'ersnow'd and bareness every where:
 Then, were not summer's distillation left,
 A liquid prisoner pent in walls of glass,
 Beauty's effect with beauty were bereft,
 Nor it nor no remembrance what it was:
 But flowers distill'd though they with winter meet,
 Leese but their show; their substance still lives sweet.
 Then let not winter's ragged hand deface
 In thee thy summer, ere thou be distill'd:
 Make sweet some vial; treasure thou some place
 With beauty's treasure, ere it be self-kill'd.
 That use is not forbidden usury,
 Which happies those that pay the willing loan;
 That's for thyself to breed another thee,
 Or ten times happier, be it ten for one;
 Ten times thyself were happier than thou art,
 If ten of thine ten times refigured thee:
 Then what could death do, if thou shouldst depart,
 Leaving thee living in posterity?
 Be not self-will'd, for thou art much too fair
 To be death's conquest and make worms thine heir.
 Lo! in the orient when the gracious light
 Lifts up his burning head, each under eye
 Doth homage to his new-appearing sight,
 Serving with looks his sacred majesty;
 And having climb'd the steep-up heavenly hill,
 Resembling strong youth in his middle age,
 Yet mortal looks adore his beauty still,
 Attending on his golden pilgrimage;
 But when from highmost pitch, with weary car,
 Like feeble age, he reeleth from the day,
 The eyes, 'fore duteous, now converted are
 From his low tract and look another way:
 So thou, thyself out-going in thy noon,
 Unlook'd on diest, unless thou get a son.
 Music to hear, why hear'st thou music sadly?
 Sweets with sweets war not, joy delights in joy.
 Why lovest thou that which thou receivest not gladly,
 Or else receivest with pleasure thine annoy?
 If the true concord of well-tuned sounds,
 By unions married, do offend thine ear,
 They do but sweetly chide thee, who confounds
 In singleness the parts that thou shouldst bear.
 Mark how one string, sweet husband to another,
 Strikes each in each by mutual ordering,
 Resembling sire and child and happy mother
 Who all in one, one pleasing note do sing:
 Whose speechless song, being many, seeming one,
 Sings this to thee: 'thou single wilt prove none.'
 Is it for fear to wet a widow's eye
 That thou consumest thyself in single life?
 Ah! if thou issueless shalt hap to die.
 The world will wail thee, like a makeless wife;
 The world will be thy widow and still weep
 That thou no form of thee hast left behind,
 When every private widow well may keep
 By children's eyes her husband's shape in mind.
 Look, what an unthrift in the world doth spend
 Shifts but his place, for still the world enjoys it;
 But beauty's waste hath in the world an end,
 And kept unused, the user so destroys it.
 No love toward others in that bosom sits
 That on himself such murderous shame commits.
 For shame! deny that thou bear'st love to any,
 Who for thyself art so unprovident.
 Grant, if thou wilt, thou art beloved of many,
 But that thou none lovest is most evident;
 For thou art so possess'd with murderous hate
 That 'gainst thyself thou stick'st not to conspire.
 Seeking that beauteous roof to ruinate
 Which to repair should be thy chief desire.
 O, change thy thought, that I may change my mind!
 Shall hate be fairer lodged than gentle love?
 Be, as thy presence is, gracious and kind,
 Or to thyself at least kind-hearted prove:
 Make thee another self, for love of me,
 That beauty still may live in thine or thee.
 As fast as thou shalt wane, so fast thou growest
 In one of thine, from that which thou departest;
 And that fresh blood which youngly thou bestowest
 Thou mayst call thine when thou from youth convertest.
 Herein lives wisdom, beauty and increase:
 Without this, folly, age and cold decay:
 If all were minded so, the times should cease
 And threescore year would make the world away.
 Let those whom Nature hath not made for store,
 Harsh featureless and rude, barrenly perish:
 Look, whom she best endow'd she gave the more;
 Which bounteous gift thou shouldst in bounty cherish:
 She carved thee for her seal, and meant thereby
 Thou shouldst print more, not let that copy die.
 When I do count the clock that tells the time,
 And see the brave day sunk in hideous night;
 When I behold the violet past prime,
 And sable curls all silver'd o'er with white;
 When lofty trees I see barren of leaves
 Which erst from heat did canopy the herd,
 And summer's green all girded up in sheaves
 Borne on the bier with white and bristly beard,
 Then of thy beauty do I question make,
 That thou among the wastes of time must go,
 Since sweets and beauties do themselves forsake
 And die as fast as they see others grow;
 And nothing 'gainst Time's scythe can make defence
 Save breed, to brave him when he takes thee hence.
 O, that you were yourself! but, love, you are
 No longer yours than you yourself here live:
 Against this coming end you should prepare,
 And your sweet semblance to some other give.
 So should that beauty which you hold in lease
 Find no determination: then you were
 Yourself again after yourself's decease,
 When your sweet issue your sweet form should bear.
 Who lets so fair a house fall to decay,
 Which husbandry in honour might uphold
 Against the stormy gusts of winter's day
 And barren rage of death's eternal cold?
 O, none but unthrifts! Dear my love, you know
 You had a father: let your son say so.
 Not from the stars do I my judgment pluck;
 And yet methinks I have astronomy,
 But not to tell of good or evil luck,
 Of plagues, of dearths, or seasons' quality;
 Nor can I fortune to brief minutes tell,
 Pointing to each his thunder, rain and wind,
 Or say with princes if it shall go well,
 By oft predict that I in heaven find:
 But from thine eyes my knowledge I derive,
 And, constant stars, in them I read such art
 As truth and beauty shall together thrive,
 If from thyself to store thou wouldst convert;
 Or else of thee this I prognosticate:
 Thy end is truth's and beauty's doom and date.
 When I consider every thing that grows
 Holds in perfection but a little moment,
 That this huge stage presenteth nought but shows
 Whereon the stars in secret influence comment;
 When I perceive that men as plants increase,
 Cheered and cheque'd even by the self-same sky,
 Vaunt in their youthful sap, at height decrease,
 And wear their brave state out of memory;
 Then the conceit of this inconstant stay
 Sets you most rich in youth before my sight,
 Where wasteful Time debateth with Decay,
 To change your day of youth to sullied night;
 And all in war with Time for love of you,
 As he takes from you, I engraft you new.
 But wherefore do not you a mightier way
 Make war upon this bloody tyrant, Time?
 And fortify yourself in your decay
 With means more blessed than my barren rhyme?
 Now stand you on the top of happy hours,
 And many maiden gardens yet unset
 With virtuous wish would bear your living flowers,
 Much liker than your painted counterfeit:
 So should the lines of life that life repair,
 Which this, Time's pencil, or my pupil pen,
 Neither in inward worth nor outward fair,
 Can make you live yourself in eyes of men.
 To give away yourself keeps yourself still,
 And you must live, drawn by your own sweet skill.
 Who will believe my verse in time to come,
 If it were fill'd with your most high deserts?
 Though yet, heaven knows, it is but as a tomb
 Which hides your life and shows not half your parts.
 If I could write the beauty of your eyes
 And in fresh numbers number all your graces,
 The age to come would say 'This poet lies:
 Such heavenly touches ne'er touch'd earthly faces.'
 So should my papers yellow'd with their age
 Be scorn'd like old men of less truth than tongue,
 And your true rights be term'd a poet's rage
 And stretched metre of an antique song:
 But were some child of yours alive that time,
 You should live twice; in it and in my rhyme.
 Shall I compare thee to a summer's day?
 Thou art more lovely and more temperate:
 Rough winds do shake the darling buds of May,
 And summer's lease hath all too short a date:
 Sometime too hot the eye of heaven shines,
 And often is his gold complexion dimm'd;
 And every fair from fair sometime declines,
 By chance or nature's changing course untrimm'd;
 But thy eternal summer shall not fade
 Nor lose possession of that fair thou owest;
 Nor shall Death brag thou wander'st in his shade,
 When in eternal lines to time thou growest:
 So long as men can breathe or eyes can see,
 So long lives this and this gives life to thee.
 Devouring Time, blunt thou the lion's paws,
 And make the earth devour her own sweet brood;
 Pluck the keen teeth from the fierce tiger's jaws,
 And burn the long-lived phoenix in her blood;
 Make glad and sorry seasons as thou fleets,
 And do whate'er thou wilt, swift-footed Time,
 To the wide world and all her fading sweets;
 But I forbid thee one most heinous crime:
 O, carve not with thy hours my love's fair brow,
 Nor draw no lines there with thine antique pen;
 Him in thy course untainted do allow
 For beauty's pattern to succeeding men.
 Yet, do thy worst, old Time: despite thy wrong,
 My love shall in my verse ever live young.
 A woman's face with Nature's own hand painted
 Hast thou, the master-mistress of my passion;
 A woman's gentle heart, but not acquainted
 With shifting change, as is false women's fashion;
 An eye more bright than theirs, less false in rolling,
 Gilding the object whereupon it gazeth;
 A man in hue, all 'hues' in his controlling,
 Much steals men's eyes and women's souls amazeth.
 And for a woman wert thou first created;
 Till Nature, as she wrought thee, fell a-doting,
 And by addition me of thee defeated,
 By adding one thing to my purpose nothing.
 But since she prick'd thee out for women's pleasure,
 Mine be thy love and thy love's use their treasure.
 So is it not with me as with that Muse
 Stirr'd by a painted beauty to his verse,
 Who heaven itself for ornament doth use
 And every fair with his fair doth rehearse
 Making a couplement of proud compare,
 With sun and moon, with earth and sea's rich gems,
 With April's first-born flowers, and all things rare
 That heaven's air in this huge rondure hems.
 O' let me, true in love, but truly write,
 And then believe me, my love is as fair
 As any mother's child, though not so bright
 As those gold candles fix'd in heaven's air:
 Let them say more than like of hearsay well;
 I will not praise that purpose not to sell.
 My glass shall not persuade me I am old,
 So long as youth and thou are of one date;
 But when in thee time's furrows I behold,
 Then look I death my days should expiate.
 For all that beauty that doth cover thee
 Is but the seemly raiment of my heart,
 Which in thy breast doth live, as thine in me:
 How can I then be elder than thou art?
 O, therefore, love, be of thyself so wary
 As I, not for myself, but for thee will;
 Bearing thy heart, which I will keep so chary
 As tender nurse her babe from faring ill.
 Presume not on thy heart when mine is slain;
 Thou gavest me thine, not to give back again.
 As an unperfect actor on the stage
 Who with his fear is put besides his part,
 Or some fierce thing replete with too much rage,
 Whose strength's abundance weakens his own heart.
 So I, for fear of trust, forget to say
 The perfect ceremony of love's rite,
 And in mine own love's strength seem to decay,
 O'ercharged with burden of mine own love's might.
 O, let my books be then the eloquence
 And dumb presagers of my speaking breast,
 Who plead for love and look for recompense
 More than that tongue that more hath more express'd.
 O, learn to read what silent love hath writ:
 To hear with eyes belongs to love's fine wit.
 Mine eye hath play'd the painter and hath stell'd
 Thy beauty's form in table of my heart;
 My body is the frame wherein 'tis held,
 And perspective it is the painter's art.
 For through the painter must you see his skill,
 To find where your true image pictured lies;
 Which in my bosom's shop is hanging still,
 That hath his windows glazed with thine eyes.
 Now see what good turns eyes for eyes have done:
 Mine eyes have drawn thy shape, and thine for me
 Are windows to my breast, where-through the sun
 Delights to peep, to gaze therein on thee;
 Yet eyes this cunning want to grace their art;
 They draw but what they see, know not the heart.
 Let those who are in favour with their stars
 Of public honour and proud titles boast,
 Whilst I, whom fortune of such triumph bars,
 Unlook'd for joy in that I honour most.
 Great princes' favourites their fair leaves spread
 But as the marigold at the sun's eye,
 And in themselves their pride lies buried,
 For at a frown they in their glory die.
 The painful warrior famoused for fight,
 After a thousand victories once foil'd,
 Is from the book of honour razed quite,
 And all the rest forgot for which he toil'd:
 Then happy I, that love and am beloved
 Where I may not remove nor be removed.
 Lord of my love, to whom in vassalage
 Thy merit hath my duty strongly knit,
 To thee I send this written embassage,
 To witness duty, not to show my wit:
 Duty so great, which wit so poor as mine
 May make seem bare, in wanting words to show it,
 But that I hope some good conceit of thine
 In thy soul's thought, all naked, will bestow it;
 Till whatsoever star that guides my moving
 Points on me graciously with fair aspect
 And puts apparel on my tatter'd loving,
 To show me worthy of thy sweet respect:
 Then may I dare to boast how I do love thee;
 Till then not show my head where thou mayst prove me.
 Weary with toil, I haste me to my bed,
 The dear repose for limbs with travel tired;
 But then begins a journey in my head,
 To work my mind, when body's work's expired:
 For then my thoughts, from far where I abide,
 Intend a zealous pilgrimage to thee,
 And keep my drooping eyelids open wide,
 Looking on darkness which the blind do see
 Save that my soul's imaginary sight
 Presents thy shadow to my sightless view,
 Which, like a jewel hung in ghastly night,
 Makes black night beauteous and her old face new.
 Lo! thus, by day my limbs, by night my mind,
 For thee and for myself no quiet find.
 How can I then return in happy plight,
 That am debarr'd the benefit of rest?
 When day's oppression is not eased by night,
 But day by night, and night by day, oppress'd?
 And each, though enemies to either's reign,
 Do in consent shake hands to torture me;
 The one by toil, the other to complain
 How far I toil, still farther off from thee.
 I tell the day, to please him thou art bright
 And dost him grace when clouds do blot the heaven:
 So flatter I the swart-complexion'd night,
 When sparkling stars twire not thou gild'st the even.
 But day doth daily draw my sorrows longer
 And night doth nightly make grief's strength seem stronger.
 When, in disgrace with fortune and men's eyes,
 I all alone beweep my outcast state
 And trouble deaf heaven with my bootless cries
 And look upon myself and curse my fate,
 Wishing me like to one more rich in hope,
 Featured like him, like him with friends possess'd,
 Desiring this man's art and that man's scope,
 With what I most enjoy contented least;
 Yet in these thoughts myself almost despising,
 Haply I think on thee, and then my state,
 Like to the lark at break of day arising
 From sullen earth, sings hymns at heaven's gate;
 For thy sweet love remember'd such wealth brings
 That then I scorn to change my state with kings.
 When to the sessions of sweet silent thought
 I summon up remembrance of things past,
 I sigh the lack of many a thing I sought,
 And with old woes new wail my dear time's waste:
 Then can I drown an eye, unused to flow,
 For precious friends hid in death's dateless night,
 And weep afresh love's long since cancell'd woe,
 And moan the expense of many a vanish'd sight:
 Then can I grieve at grievances foregone,
 And heavily from woe to woe tell o'er
 The sad account of fore-bemoaned moan,
 Which I new pay as if not paid before.
 But if the while I think on thee, dear friend,
 All losses are restored and sorrows end.
 Thy bosom is endeared with all hearts,
 Which I by lacking have supposed dead,
 And there reigns love and all love's loving parts,
 And all those friends which I thought buried.
 How many a holy and obsequious tear
 Hath dear religious love stol'n from mine eye
 As interest of the dead, which now appear
 But things removed that hidden in thee lie!
 Thou art the grave where buried love doth live,
 Hung with the trophies of my lovers gone,
 Who all their parts of me to thee did give;
 That due of many now is thine alone:
 Their images I loved I view in thee,
 And thou, all they, hast all the all of me.
 If thou survive my well-contented day,
 When that churl Death my bones with dust shall cover,
 And shalt by fortune once more re-survey
 These poor rude lines of thy deceased lover,
 Compare them with the bettering of the time,
 And though they be outstripp'd by every pen,
 Reserve them for my love, not for their rhyme,
 Exceeded by the height of happier men.
 O, then vouchsafe me but this loving thought:
 'Had my friend's Muse grown with this growing age,
 A dearer birth than this his love had brought,
 To march in ranks of better equipage:
 But since he died and poets better prove,
 Theirs for their style I'll read, his for his love.'
 Full many a glorious morning have I seen
 Flatter the mountain-tops with sovereign eye,
 Kissing with golden face the meadows green,
 Gilding pale streams with heavenly alchemy;
 Anon permit the basest clouds to ride
 With ugly rack on his celestial face,
 And from the forlorn world his visage hide,
 Stealing unseen to west with this disgrace:
 Even so my sun one early morn did shine
 With all triumphant splendor on my brow;
 But out, alack! he was but one hour mine;
 The region cloud hath mask'd him from me now.
 Yet him for this my love no whit disdaineth;
 Suns of the world may stain when heaven's sun staineth.
 Why didst thou promise such a beauteous day,
 And make me travel forth without my cloak,
 To let base clouds o'ertake me in my way,
 Hiding thy bravery in their rotten smoke?
 'Tis not enough that through the cloud thou break,
 To dry the rain on my storm-beaten face,
 For no man well of such a salve can speak
 That heals the wound and cures not the disgrace:
 Nor can thy shame give physic to my grief;
 Though thou repent, yet I have still the loss:
 The offender's sorrow lends but weak relief
 To him that bears the strong offence's cross.
 Ah! but those tears are pearl which thy love sheds,
 And they are rich and ransom all ill deeds.
 No more be grieved at that which thou hast done:
 Roses have thorns, and silver fountains mud;
 Clouds and eclipses stain both moon and sun,
 And loathsome canker lives in sweetest bud.
 All men make faults, and even I in this,
 Authorizing thy trespass with compare,
 Myself corrupting, salving thy amiss,
 Excusing thy sins more than thy sins are;
 For to thy sensual fault I bring in sense--
 Thy adverse party is thy advocate--
 And 'gainst myself a lawful plea commence:
 Such civil war is in my love and hate
 That I an accessary needs must be
 To that sweet thief which sourly robs from me.
 Let me confess that we two must be twain,
 Although our undivided loves are one:
 So shall those blots that do with me remain
 Without thy help by me be borne alone.
 In our two loves there is but one respect,
 Though in our lives a separable spite,
 Which though it alter not love's sole effect,
 Yet doth it steal sweet hours from love's delight.
 I may not evermore acknowledge thee,
 Lest my bewailed guilt should do thee shame,
 Nor thou with public kindness honour me,
 Unless thou take that honour from thy name:
 But do not so; I love thee in such sort
 As, thou being mine, mine is thy good report.
 As a decrepit father takes delight
 To see his active child do deeds of youth,
 So I, made lame by fortune's dearest spite,
 Take all my comfort of thy worth and truth.
 For whether beauty, birth, or wealth, or wit,
 Or any of these all, or all, or more,
 Entitled in thy parts do crowned sit,
 I make my love engrafted to this store:
 So then I am not lame, poor, nor despised,
 Whilst that this shadow doth such substance give
 That I in thy abundance am sufficed
 And by a part of all thy glory live.
 Look, what is best, that best I wish in thee:
 This wish I have; then ten times happy me!
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -0,0 +1,90 @@
 # Ask CMake to write compile_commands.json for the generated build tree.
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 #
 # Define environment variables for special configurations
 #
 # VLLM_CPU_AVX512BF16: when this environment variable is defined (only its
 # presence is tested, not its value) ENABLE_AVX512BF16 is switched on, which
 # the ISA check further down treats like a detected avx512_bf16 CPU flag.
 if(DEFINED ENV{VLLM_CPU_AVX512BF16})
    set(ENABLE_AVX512BF16 ON)
 endif()
 # Add the project's csrc/ directory to the include search path.
 include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 # Check the compile flags
 #
 # Baseline flags: OpenMP plus the VLLM_CPU_EXTENSION preprocessor define.
 list(APPEND CXX_COMPILE_FLAGS 
    "-fopenmp"
    "-DVLLM_CPU_EXTENSION")
 # Capture the text of /proc/cpuinfo in CPUINFO so it can be searched for ISA
 # flag names; configuration aborts if `cat` exits with a non-zero status.
 execute_process(COMMAND cat /proc/cpuinfo
                RESULT_VARIABLE CPUINFO_RET
                OUTPUT_VARIABLE CPUINFO)
 if (NOT CPUINFO_RET EQUAL 0)
    message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
 endif()
 # Set `OUT` in the caller's scope to ON when the substring `TARGET` occurs
 # anywhere in the text `CPUINFO`, and to OFF otherwise.
 function (find_isa CPUINFO TARGET OUT)
    # Both operands are quoted so that an empty value, or text containing `;`,
    # still reaches string(FIND) as exactly one argument each.
    string(FIND "${CPUINFO}" "${TARGET}" ISA_FOUND)
    # string(FIND) stores -1 when the substring is absent.
    if(NOT ISA_FOUND EQUAL -1)
        set(${OUT} ON PARENT_SCOPE)
    else()
        set(${OUT} OFF PARENT_SCOPE)
    endif()
 endfunction()
 # Extend CXX_COMPILE_FLAGS according to the ISA flags found in CPUINFO.
 # AVX512F is mandatory (fatal error otherwise); AVX512-BF16 is optional.
 # CPUINFO is passed quoted so the whole text arrives in find_isa as a single
 # argument even if it is empty or contains `;`.
 find_isa("${CPUINFO}" "avx512f" AVX512_FOUND)
 if (AVX512_FOUND)
    list(APPEND CXX_COMPILE_FLAGS
        "-mavx512f"
        "-mavx512vl"
        "-mavx512bw"
        "-mavx512dq")
    find_isa("${CPUINFO}" "avx512_bf16" AVX512BF16_FOUND)
    # BF16 is enabled when the CPU reports it or when ENABLE_AVX512BF16 was
    # requested, but only with a GNU compiler of version 12.3 or newer.
    if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND 
            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) 
            list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
        else()
            message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
        endif()
    else()
        message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
    endif()
 else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
 endif()
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
 #
 # Define extension targets
 #
 #
 # _C extension
 #
 # C++ sources compiled into the `_C` module.
 set(VLLM_EXT_SRC
    "csrc/cpu/activation.cpp"
    "csrc/cpu/attention.cpp"
    "csrc/cpu/cache.cpp"
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/pos_encoding.cpp"
    "csrc/cpu/pybind.cpp")
 # Reuse the shared extension helper with LANGUAGE CXX: the module is named
 # `_C`, installed under `vllm`, and built with the flags collected above.
 define_gpu_extension_target(
    _C
    DESTINATION vllm
    LANGUAGE CXX
    SOURCES ${VLLM_EXT_SRC}
    COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
    WITH_SOABI 
 )
 # `default` is an aggregate target whose only dependency is the `_C` module.
 add_custom_target(default)
 message(STATUS "Enabling C extension.")
 add_dependencies(default _C)
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -0,0 +1,73 @@
 #!/usr/bin/env python3
 #
 # A command line tool for running pytorch's hipify preprocessor on CUDA
 # source files.
 #
 # See https://github.com/ROCm/hipify_torch
 # and <torch install dir>/utils/hipify/hipify_python.py
 #
 import argparse
 import os
 import shutil
 from torch.utils.hipify.hipify_python import hipify
 if __name__ == '__main__':
    # Usage: hipify.py -p <project_dir> -o <output_dir> [sources ...]
    #
    # Copies the project directory into the output directory, runs torch's
    # hipify over the listed sources and prints one resulting path per input
    # source (newline separated) on stdout.
    parser = argparse.ArgumentParser()

    # Project directory where all the source + include files live.
    # Required: the path is joined and copied below, so a missing value would
    # otherwise surface as an unrelated TypeError.
    parser.add_argument(
        "-p",
        "--project_dir",
        required=True,
        help="The project directory.",
    )

    # Directory where hipified files are written.
    parser.add_argument(
        "-o",
        "--output_dir",
        required=True,
        help="The output directory.",
    )

    # Source files to convert.
    parser.add_argument("sources",
                        help="Source files to hipify.",
                        nargs="*",
                        default=[])

    args = parser.parse_args()

    # Limit include scope to project_dir only
    includes = [os.path.join(args.project_dir, '*')]

    # Get absolute path for all source files.
    extra_files = [os.path.abspath(s) for s in args.sources]

    # Copy sources from project directory to output directory.
    # The directory might already exist to hold object files so we ignore that.
    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)

    hipify_result = hipify(project_directory=args.project_dir,
                           output_directory=args.output_dir,
                           header_include_dirs=[],
                           includes=includes,
                           extra_files=extra_files,
                           show_detailed=True,
                           is_pytorch_extension=True,
                           hipify_extra_files_only=True)

    # Map every input to its hipified path; an input that has no entry in the
    # result (or whose entry has no hipified path) is passed through unchanged.
    hipified_sources = []
    for source in args.sources:
        s_abs = os.path.abspath(source)
        hipified_s_abs = (hipify_result[s_abs].hipified_path if
                          (s_abs in hipify_result
                           and hipify_result[s_abs].hipified_path is not None)
                          else s_abs)
        hipified_sources.append(hipified_s_abs)

    assert (len(hipified_sources) == len(args.sources))

    # Print hipified source files.
    print("\n".join(hipified_sources))
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -0,0 +1,354 @@
 #
 # Attempt to find the python package that uses the same python executable as
 # `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
 #
 # `SUPPORTED_VERSIONS` and any further arguments together form the list of
 # accepted "<major>.<minor>" versions. Configuration stops with a fatal error
 # when Python cannot be found or its version is not in that list.
 #
 # Because this is a macro, the variables it sets (`Python_EXECUTABLE`, `_VER`,
 # `_SUPPORTED_VERSIONS_LIST`, and everything find_package(Python) defines) are
 # created in the caller's scope.
 #
 macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
  # NOTE(review): macro parameters are replaced textually, so the
  # `${EXECUTABLE}` references below presumably still expand to the original
  # argument rather than to the resolved path stored here - confirm before
  # relying on symlink resolution.
  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
  set(Python_EXECUTABLE ${EXECUTABLE})
  find_package(Python COMPONENTS Interpreter Development.Module)
  if (NOT Python_FOUND)
    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
  endif()
  set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
  # ARGN picks up extra versions when the caller passes the list unquoted.
  set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
  if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
    message(FATAL_ERROR
      "Python version (${_VER}) is not one of the supported versions: "
      "${_SUPPORTED_VERSIONS_LIST}.")
  endif()
  message(STATUS "Found python matching: ${EXECUTABLE}.")
 endmacro()
 #
 # Run `EXPR` in python.  The standard output of python is stored in `OUT` and
 # has trailing whitespace stripped.  If an error is encountered when running
 # python, a fatal message `ERR_MSG` is issued.
 #
 # Arguments:
 #   OUT     - Name of the variable, set in the caller's scope, that receives
 #             python's standard output.
 #   EXPR    - Python source passed to `"${Python_EXECUTABLE}" -c`.
 #   ERR_MSG - Prefix of the fatal error; python's stderr is appended to it.
 #
 function (run_python OUT EXPR ERR_MSG)
  execute_process(
    COMMAND
    "${Python_EXECUTABLE}" "-c" "${EXPR}"
    OUTPUT_VARIABLE PYTHON_OUT
    RESULT_VARIABLE PYTHON_ERROR_CODE
    ERROR_VARIABLE PYTHON_STDERR
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(NOT PYTHON_ERROR_CODE EQUAL 0)
    message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
  endif()
  # Quoted so that empty output still defines `OUT` (as an empty string)
  # instead of unsetting it in the caller's scope. A `;`-separated output
  # remains a CMake list either way.
  set(${OUT} "${PYTHON_OUT}" PARENT_SCOPE)
 endfunction()
 # Run `EXPR` in python after importing `PKG`. Use the result of this to extend
 # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
 #
 # The python snippet executed is `import <PKG>; print(<EXPR>)`; whatever it
 # prints is appended to `CMAKE_PREFIX_PATH`. Being a macro, both
 # `_PREFIX_PATH` and the updated `CMAKE_PREFIX_PATH` live in the caller's
 # scope. A python failure is fatal ("Failed to locate <PKG> path").
 macro (append_cmake_prefix_path PKG EXPR)
  run_python(_PREFIX_PATH
    "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
 endmacro()
 #
 # Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
 # of CUDA source files. The names of the corresponding "hipified" sources are
 # stored in `OUT_SRCS`.
 #
 # Arguments:
 #   OUT_SRCS  - Name of the caller's variable receiving the generated HIP
 #               source paths followed by the untouched C++ sources.
 #   NAME      - Suffix of the custom target name (`hipify${NAME}`).
 #   ORIG_SRCS - List of original source files.
 #
 function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
  #
  # Split into C++ and non-C++ (i.e. CUDA) sources.
  #
  # The backslash is doubled because CMake consumes one level of escaping in a
  # quoted argument; the regex engine must receive `\.` so that only a literal
  # dot followed by `cc` or `cpp` at the very end of the path matches. The
  # alternation is grouped so the `$` anchor applies to both extensions.
  set(SRCS ${ORIG_SRCS})
  set(CXX_SRCS ${ORIG_SRCS})
  list(FILTER SRCS EXCLUDE REGEX "\\.(cc|cpp)$")
  list(FILTER CXX_SRCS INCLUDE REGEX "\\.(cc|cpp)$")
  #
  # Generate ROCm/HIP source file names from CUDA file names.
  # Since HIP files are generated code, they will appear in the build area
  # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
  #
  set(HIP_SRCS)
  foreach (SRC ${SRCS})
    # A trailing `.cu` becomes `.hip`, then every `cuda` in the path is
    # rewritten to `hip`.
    string(REGEX REPLACE "\\.cu$" ".hip" SRC ${SRC})
    string(REGEX REPLACE "cuda" "hip" SRC ${SRC})
    list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
  endforeach()
  set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
  # The generated files are declared as BYPRODUCTS so the build system knows
  # which rule creates them.
  add_custom_target(
    hipify${NAME}
    COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
    DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
    BYPRODUCTS ${HIP_SRCS}
    COMMENT "Running hipify on ${NAME} extension source files.")
  # Swap out original extension sources with hipified sources.
  list(APPEND HIP_SRCS ${CXX_SRCS})
  set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
 endfunction()
 #
 # Get additional GPU compiler flags from torch.
 #
 # Arguments:
 #   OUT_GPU_FLAGS - Name of the caller's variable receiving the flag list.
 #   GPU_LANG      - "CUDA" or "HIP"; for any other value the output is empty.
 #
 function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
  # `GPU_LANG` is compared by variable name (as `define_gpu_extension_target`
  # does with `GPU_LANGUAGE`) so an empty argument cannot break the `if`.
  if (GPU_LANG STREQUAL "CUDA")
    #
    # Get common NVCC flags from torch.
    #
    run_python(GPU_FLAGS
      "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
      "Failed to determine torch nvcc compiler flags")
    if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
      list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
    endif()
    # From CUDA 12.0 on, drop torch's defines that disable the half/bfloat16
    # operators and conversions.
    if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
      list(REMOVE_ITEM GPU_FLAGS
        "-D__CUDA_NO_HALF_OPERATORS__"
        "-D__CUDA_NO_HALF_CONVERSIONS__"
        "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
        "-D__CUDA_NO_HALF2_OPERATORS__")
    endif()
  elseif(GPU_LANG STREQUAL "HIP")
    #
    # Get common HIP/HIPCC flags from torch.
    #
    run_python(GPU_FLAGS
      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
      "Failed to determine torch hip compiler flags")
    list(APPEND GPU_FLAGS
      "-DUSE_ROCM"
      "-DENABLE_FP8_E4M3"
      "-U__HIP_NO_HALF_CONVERSIONS__"
      "-U__HIP_NO_HALF_OPERATORS__"
      "-fno-gpu-rdc")
  endif()
  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
 endfunction()
 # Macro for converting a `gencode` version number to a cmake version number.
 #
 # A dot is inserted before the last digit of the leading digit run, e.g.
 # `80` -> `8.0` and `90a` -> `9.0a`; the result is stored in the variable
 # named by `OUT_VER`.
 macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
 endmacro()
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
 # `GPU_ARCHES`.
 #
 # Arguments:
 #   GPU_ARCHES           - Name of the variable receiving the filtered list.
 #   GPU_LANG             - "HIP" or "CUDA".
 #   GPU_SUPPORTED_ARCHES - Supported architectures; any further arguments are
 #                          appended to this list.
 #
 # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
 #
 macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
  set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
  message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")
  if (${GPU_LANG} STREQUAL "HIP")
    #
    # `GPU_ARCHES` controls the `--offload-arch` flags.
    # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
    # via the `PYTORCH_ROCM_ARCH` env variable.
    #
    #
    # Find the intersection of the supported + detected architectures to
    # set the module architecture flags.
    #
    set(${GPU_ARCHES})
    foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
      if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        list(APPEND ${GPU_ARCHES} ${_ARCH})
      endif()
    endforeach()
    if(NOT ${GPU_ARCHES})
      message(FATAL_ERROR
        "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()
  elseif(${GPU_LANG} STREQUAL "CUDA")
    #
    # Setup/process CUDA arch flags.
    #
    # The torch cmake setup hardcodes the detected architecture flags in
    # `CMAKE_CUDA_FLAGS`.  Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
    # can't be modified on a per-target basis, e.g. for the `punica` extension.
    # So, all the `-gencode` flags need to be extracted and removed from
    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
    # Since it's not possible to use `target_compile_options` for adding target
    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
    # must be used instead.  This requires repackaging the architecture flags
    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
    #
    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
    # to one of the other nvcc options to specify architectures.
    #
    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
    # detected architectures.
    #
    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
    # (the input is quoted so that empty flags reach the diagnostic below
    # instead of failing on a missing argument).
    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
      "${CMAKE_CUDA_FLAGS}")
    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
    # and passed back via the `CUDA_ARCHITECTURES` property.
    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
      "${CMAKE_CUDA_FLAGS}")
    # If this error is triggered, it might mean that torch has changed how it sets
    # up nvcc architecture code generation flags.
    if (NOT _CUDA_ARCH_FLAGS)
      message(FATAL_ERROR
        "Could not find any architecture related code generation flags in "
        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
    endif()
    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")
    # Initialize the architecture lists to empty.
    set(${GPU_ARCHES})
    # Process each `gencode` flag.
    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
      # For each flag, extract the version number and whether it refers to PTX
      # or native code.
      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
      # for that match.
      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
      if (_COMPUTE)
        set(_COMPUTE ${CMAKE_MATCH_1})
      endif()
      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
      if (_SM)
        set(_SM ${CMAKE_MATCH_1})
      endif()
      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
      if (_CODE)
        set(_CODE ${CMAKE_MATCH_1})
      endif()
      # Make sure the virtual architecture can be matched.
      if (NOT _COMPUTE)
        message(FATAL_ERROR
          "Could not determine virtual architecture from: ${_ARCH}.")
      endif()
      # One of sm_ or compute_ must exist.
      if ((NOT _SM) AND (NOT _CODE))
        message(FATAL_ERROR
          "Could not determine a codegen architecture from: ${_ARCH}.")
      endif()
      if (_SM)
        # -real suffix let CMake to only generate elf code for the kernels.
        # we want this, otherwise the added ptx (default) will increase binary size.
        set(_VIRT "-real")
        set(_CODE_ARCH ${_SM})
      else()
        # -virtual suffix let CMake to generate ptx code for the kernels.
        set(_VIRT "-virtual")
        set(_CODE_ARCH ${_CODE})
      endif()
      # Check if the current version is in the supported arch list.
      string_to_ver(_CODE_VER ${_CODE_ARCH})
      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        # Report the architecture that was actually rejected.
        message(STATUS "discarding unsupported CUDA arch ${_CODE_VER}.")
        continue()
      endif()
      # Add it to the arch list.
      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
    endforeach()
  endif()
  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
 endmacro()
 #
 # Define a target named `GPU_MOD_NAME` for a single extension. The
 # arguments are:
 #
 # DESTINATION <dest>         - Module destination directory.
 # LANGUAGE <lang>            - The GPU language for this module, e.g CUDA, HIP,
 #                              etc.
 # SOURCES <sources>          - List of source files relative to CMakeLists.txt
 #                              directory.
 #
 # Optional arguments:
 #
 # ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
 #                              format.
 #                              Refer `CMAKE_CUDA_ARCHITECTURES` documentation
 #                              and `CMAKE_HIP_ARCHITECTURES` for more info.
 #                              ARCHITECTURES will use cmake's defaults if
 #                              not provided.
 # COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
 # INCLUDE_DIRECTORIES <dirs> - Extra include directories.
 # LIBRARIES <libraries>      - Extra link libraries.
 # WITH_SOABI                 - Generate library with python SOABI suffix name.
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
 function (define_gpu_extension_target GPU_MOD_NAME)
  # Parse the keyword arguments after the module name; each value is stored in
  # a variable prefixed with `GPU_` (e.g. `GPU_SOURCES`, `GPU_LANGUAGE`).
  cmake_parse_arguments(PARSE_ARGV 1
    GPU
    "WITH_SOABI"
    "DESTINATION;LANGUAGE"
    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
  # Add hipify preprocessing step when building with HIP/ROCm.
  # `GPU_SOURCES` is replaced by the list `hipify_sources_target` produces.
  if (GPU_LANGUAGE STREQUAL "HIP")
    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
  endif()
  # Turn the parsed boolean into the literal keyword (or nothing) so it can be
  # forwarded to Python_add_library below.
  if (GPU_WITH_SOABI)
    set(GPU_WITH_SOABI WITH_SOABI)
  else()
    set(GPU_WITH_SOABI)
  endif()
  Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
  if (GPU_LANGUAGE STREQUAL "HIP")
    # Make this target dependent on the hipify preprocessor step.
    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
  endif()
  # Only override the `<LANG>_ARCHITECTURES` property when the caller supplied
  # architectures.
  if (GPU_ARCHITECTURES)
    set_target_properties(${GPU_MOD_NAME} PROPERTIES
      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
  endif()
  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
  # The extra flags apply only to sources compiled as `GPU_LANGUAGE`.
  target_compile_options(${GPU_MOD_NAME} PRIVATE
    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
  # Define TORCH_EXTENSION_NAME as the module name for the extension sources.
  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
    ${GPU_INCLUDE_DIRECTORIES})
  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
    ${GPU_LIBRARIES})
  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (GPU_LANGUAGE STREQUAL "CUDA")
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
      ${CUDA_LIBRARIES})
  else()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()
  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
 endfunction()
--- a/collect_env.py
+++ b/collect_env.py
@@ -0,0 +1,721 @@
 # ruff: noqa
 # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
 # Unlike the rest of the PyTorch this file must be python2 compliant.
 # This script outputs relevant system environment info
 # Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
 import datetime
 import locale
 import os
 import re
 import subprocess
 import sys
 from collections import namedtuple
 try:
    import torch
    TORCH_AVAILABLE = True
 except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False
 # System Environment Information
 #
 # Immutable record with one field per collected value. Fields marked
 # "vllm specific field" are additions on top of the list borrowed from
 # PyTorch's collect_env.py (see the note at the top of this file).
 SystemEnv = namedtuple(
    'SystemEnv',
    [
        'torch_version',
        'is_debug_build',
        'cuda_compiled_version',
        'gcc_version',
        'clang_version',
        'cmake_version',
        'os',
        'libc_version',
        'python_version',
        'python_platform',
        'is_cuda_available',
        'cuda_runtime_version',
        'cuda_module_loading',
        'nvidia_driver_version',
        'nvidia_gpu_models',
        'cudnn_version',
        'pip_version',  # 'pip' or 'pip3'
        'pip_packages',
        'conda_packages',
        'hip_compiled_version',
        'hip_runtime_version',
        'miopen_runtime_version',
        'caching_allocator_config',
        'is_xnnpack_available',
        'cpu_info',
        'rocm_version',  # vllm specific field
        'neuron_sdk_version',  # vllm specific field
        'vllm_version',  # vllm specific field
        'vllm_build_flags',  # vllm specific field
        'gpu_topo',  # vllm specific field
    ])
 # Substrings used by get_conda_packages() when no patterns are passed: a line
 # of `conda list` output is kept if it contains any of these.
 DEFAULT_CONDA_PATTERNS = {
    "torch",
    "numpy",
    "cudatoolkit",
    "soumith",
    "mkl",
    "magma",
    "triton",
    "optree",
    "nccl",
 }
 # Counterpart set for pip packages.
 # NOTE(review): its consumer is outside this part of the file - presumably it
 # filters pip's package listing the same way; confirm against the caller.
 DEFAULT_PIP_PATTERNS = {
    "torch",
    "numpy",
    "mypy",
    "flake8",
    "triton",
    "optree",
    "onnx",
    "nccl",
 }
 def run(command):
    """Execute *command* and return ``(return_code, stdout, stderr)``.

    A command given as ``str`` is run through the shell; any other value is
    handed to ``subprocess.Popen`` without a shell. Both streams are captured,
    decoded and stripped of surrounding whitespace.
    """
    use_shell = type(command) is str
    process = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               shell=use_shell)
    stdout_bytes, stderr_bytes = process.communicate()
    return_code = process.returncode
    # The 'oem' codec is used on win32; everywhere else the locale's preferred
    # encoding decides how the captured bytes are decoded.
    encoding = ('oem' if get_platform() == 'win32' else
                locale.getpreferredencoding())
    stdout_text = stdout_bytes.decode(encoding)
    stderr_text = stderr_bytes.decode(encoding)
    return return_code, stdout_text.strip(), stderr_text.strip()
 def run_and_read_all(run_lambda, command):
    """Return everything *command* printed, or ``None`` on a non-zero exit.

    *run_lambda* is called with *command* and must return a
    ``(return_code, stdout, stderr)`` triple.
    """
    return_code, stdout, _stderr = run_lambda(command)
    return stdout if return_code == 0 else None
 def run_and_parse_first_match(run_lambda, command, regex):
    """Return group 1 of the first *regex* match in the output of *command*.

    ``None`` is returned when the command exits with a non-zero code or when
    the pattern does not occur in its output.
    """
    return_code, stdout, _stderr = run_lambda(command)
    found = re.search(regex, stdout) if return_code == 0 else None
    return None if found is None else found.group(1)
 def run_and_return_first_line(run_lambda, command):
    """Return the first line printed by *command*, or ``None`` on failure.

    Failure means a non-zero return code; empty output yields ``''``.
    """
    return_code, stdout, _stderr = run_lambda(command)
    if return_code == 0:
        first_line, _sep, _rest = stdout.partition('\n')
        return first_line
    return None
 def get_conda_packages(run_lambda, patterns=None):
    """Return the lines of ``conda list`` that mention a package of interest.

    The conda executable comes from the ``CONDA_EXE`` environment variable
    (falling back to ``conda``). Lines starting with ``#`` are dropped and the
    rest are kept when they contain any entry of *patterns*, which defaults to
    ``DEFAULT_CONDA_PATTERNS``. Returns ``None`` when the command fails.
    """
    wanted = DEFAULT_CONDA_PATTERNS if patterns is None else patterns
    conda_exe = os.environ.get('CONDA_EXE', 'conda')
    listing = run_and_read_all(run_lambda, "{} list".format(conda_exe))
    if listing is None:
        return None
    kept = []
    for row in listing.splitlines():
        if row.startswith("#"):
            continue
        if any(name in row for name in wanted):
            kept.append(row)
    return "\n".join(kept)
 def get_gcc_version(run_lambda):
    """Return what follows ``gcc `` in ``gcc --version`` output, else ``None``."""
    version_regex = r'gcc (.*)'
    return run_and_parse_first_match(run_lambda, 'gcc --version',
                                     version_regex)
 def get_clang_version(run_lambda):
    """Return what follows ``clang version `` in ``clang --version``, else ``None``."""
    version_regex = r'clang version (.*)'
    return run_and_parse_first_match(run_lambda, 'clang --version',
                                     version_regex)
 def get_cmake_version(run_lambda):
    """Return what follows ``cmake `` in ``cmake --version`` output, else ``None``."""
    version_regex = r'cmake (.*)'
    return run_and_parse_first_match(run_lambda, 'cmake --version',
                                     version_regex)
 def get_nvidia_driver_version(run_lambda):
    """Return the NVIDIA driver version string, or ``None`` if not found.

    On darwin the version is parsed from ``kextstat``; on every other
    platform it is parsed from the output of the nvidia-smi executable.
    """
    if get_platform() != 'darwin':
        smi_command = get_nvidia_smi()
        return run_and_parse_first_match(run_lambda, smi_command,
                                         r'Driver Version: (.*?) ')
    kext_command = 'kextstat | grep -i cuda'
    return run_and_parse_first_match(run_lambda, kext_command,
                                     r'com[.]nvidia[.]CUDA [(](.*?)[)]')
 def get_gpu_info(run_lambda):
    """Return a description of the installed GPU(s), or ``None``.

    On darwin, or when torch reports a HIP version, the device name comes
    from torch (with the ``gcnArchName`` appended on HIP). Otherwise the
    output of ``nvidia-smi -L`` is returned with the UUIDs removed.
    """
    on_darwin = get_platform() == 'darwin'
    if on_darwin or (TORCH_AVAILABLE and hasattr(torch.version, 'hip')
                     and torch.version.hip is not None):
        if not (TORCH_AVAILABLE and torch.cuda.is_available()):
            return None
        if torch.version.hip is None:
            arch_suffix = ""
        else:
            props = torch.cuda.get_device_properties(0)
            if hasattr(props, "gcnArchName"):
                arch_suffix = " ({})".format(props.gcnArchName)
            else:
                arch_suffix = "NoGCNArchNameOnOldPyTorch"
        return torch.cuda.get_device_name(None) + arch_suffix

    list_command = get_nvidia_smi() + ' -L'
    return_code, listing, _stderr = run_lambda(list_command)
    if return_code != 0:
        return None
    # Anonymize GPUs by removing their UUID
    return re.sub(r' \(UUID: .+?\)', '', listing)
 def get_running_cuda_version(run_lambda):
    """Return the ``V...`` version printed by ``nvcc --version``, else ``None``."""
    release_regex = r'release .+ V(.*)'
    return run_and_parse_first_match(run_lambda, 'nvcc --version',
                                     release_regex)
 def get_cudnn_version(run_lambda):
    """Locate the cuDNN library file(s) visible on this machine.

    Returns a single path, a multi-line "Probably one of the following"
    listing when several files are found, or ``None`` when nothing is found.
    If the platform-specific search yields no output, the ``CUDNN_LIBRARY``
    environment variable is consulted instead.
    """
    platform_name = get_platform()
    if platform_name == 'win32':
        system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
        cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%")
        where_cmd = os.path.join(system_root, 'System32', 'where')
        cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
    elif platform_name == 'darwin':
        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
        # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
        cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*'
    else:
        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'

    return_code, listing, _stderr = run_lambda(cudnn_cmd)
    # Return codes 0 and 1 are both accepted as long as something was printed;
    # anything else falls back to the CUDNN_LIBRARY environment variable.
    if not listing or return_code not in (0, 1):
        env_library = os.environ.get('CUDNN_LIBRARY')
        if env_library is None or not os.path.isfile(env_library):
            return None
        return os.path.realpath(env_library)

    # Resolve symbolic links first so duplicates collapse, then keep only the
    # entries that are real files. Sorting makes the result deterministic.
    resolved = {os.path.realpath(entry) for entry in listing.split('\n')}
    candidates = sorted(path for path in resolved if os.path.isfile(path))
    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]
    return 'Probably one of the following:\n{}'.format('\n'.join(candidates))
 def get_nvidia_smi():
    """Return the command string used to invoke nvidia-smi.

    On Windows the System32 copy is preferred over the legacy NVSMI
    install directory, and the first existing path is returned in double
    quotes. Everywhere else, or when neither path exists, the bare
    executable name is returned.
    """
    # Note: nvidia-smi is currently available only on Windows and Linux
    smi = 'nvidia-smi'
    if get_platform() != 'win32':
        return smi
    system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
    program_files_root = os.environ.get('PROGRAMFILES',
                                        'C:\\Program Files')
    candidates = (
        os.path.join(system_root, 'System32', smi),
        os.path.join(program_files_root, 'NVIDIA Corporation', 'NVSMI', smi),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return '"{}"'.format(candidate)
    return smi
 def get_rocm_version(run_lambda):
    """Parse the `HIP version:` token out of the `hipcc --version` output.

    The return value is whatever run_and_parse_first_match yields for that
    command and pattern.
    """
    hipcc_command = 'hipcc --version'
    hip_pattern = r'HIP version: (\S+)'
    return run_and_parse_first_match(run_lambda, hipcc_command, hip_pattern)
 def get_neuron_sdk_version(run_lambda):
    """Return the output of `neuron-ls`, or 'N/A' when it is unavailable.

    Args:
        run_lambda: Callable that takes a command and returns a
            ``(return_code, stdout, stderr)`` tuple.

    Returns:
        The captured standard output when the command exits with status 0,
        otherwise the string 'N/A'.
    """
    try:
        rc, out, _ = run_lambda(["neuron-ls"])
    except Exception:
        # Best-effort probe: a missing tool or a malformed result must not
        # abort the whole environment report.
        return 'N/A'
    # Return only stdout; handing back the whole tuple would print a tuple
    # repr in the formatted report.
    return out if rc == 0 else 'N/A'
 def get_vllm_version():
    """Return the installed vLLM version, or 'N/A' if vllm cannot be imported."""
    try:
        import vllm as vllm_module
        installed_version = vllm_module.__version__
    except ImportError:
        installed_version = 'N/A'
    return installed_version
 def summarize_vllm_build_flags():
    """Summarize build-related environment variables in a single line.

    Reports TORCH_CUDA_ARCH_LIST verbatim ('Not Set' when absent) and
    whether ROCM_HOME and NEURON_CORES are set to a non-empty value.
    """
    env = os.environ
    cuda_archs = env.get('TORCH_CUDA_ARCH_LIST', 'Not Set')
    rocm_state = 'Enabled' if env.get('ROCM_HOME') else 'Disabled'
    neuron_state = 'Enabled' if env.get('NEURON_CORES') else 'Disabled'
    return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format(
        cuda_archs, rocm_state, neuron_state)
 def get_gpu_topo(run_lambda):
    """Return the output of `nvidia-smi topo -m` on Linux, None elsewhere."""
    if get_platform() != 'linux':
        return None
    topo_command = 'nvidia-smi topo -m'
    return run_and_read_all(run_lambda, topo_command)
 # Example output of the CPU information commands, per platform
 #  * linux
 #    Architecture:            x86_64
 #      CPU op-mode(s):        32-bit, 64-bit
 #      Address sizes:         46 bits physical, 48 bits virtual
 #      Byte Order:            Little Endian
 #    CPU(s):                  128
 #      On-line CPU(s) list:   0-127
 #    Vendor ID:               GenuineIntel
 #      Model name:            Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
 #        CPU family:          6
 #        Model:               106
 #        Thread(s) per core:  2
 #        Core(s) per socket:  32
 #        Socket(s):           2
 #        Stepping:            6
 #        BogoMIPS:            5799.78
 #        Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr
 #                             sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl
 #                             xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16
 #                             pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
 #                             hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced
 #                             fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap
 #                             avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1
 #                             xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq
 #                             avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
 #    Virtualization features:
 #      Hypervisor vendor:     KVM
 #      Virtualization type:   full
 #    Caches (sum of all):
 #      L1d:                   3 MiB (64 instances)
 #      L1i:                   2 MiB (64 instances)
 #      L2:                    80 MiB (64 instances)
 #      L3:                    108 MiB (2 instances)
 #    NUMA:
 #      NUMA node(s):          2
 #      NUMA node0 CPU(s):     0-31,64-95
 #      NUMA node1 CPU(s):     32-63,96-127
 #    Vulnerabilities:
 #      Itlb multihit:         Not affected
 #      L1tf:                  Not affected
 #      Mds:                   Not affected
 #      Meltdown:              Not affected
 #      Mmio stale data:       Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
 #      Retbleed:              Not affected
 #      Spec store bypass:     Mitigation; Speculative Store Bypass disabled via prctl and seccomp
 #      Spectre v1:            Mitigation; usercopy/swapgs barriers and __user pointer sanitization
 #      Spectre v2:            Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence
 #      Srbds:                 Not affected
 #      Tsx async abort:       Not affected
 #  * win32
 #    Architecture=9
 #    CurrentClockSpeed=2900
 #    DeviceID=CPU0
 #    Family=179
 #    L2CacheSize=40960
 #    L2CacheSpeed=
 #    Manufacturer=GenuineIntel
 #    MaxClockSpeed=2900
 #    Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
 #    ProcessorType=3
 #    Revision=27142
 #
 #    Architecture=9
 #    CurrentClockSpeed=2900
 #    DeviceID=CPU1
 #    Family=179
 #    L2CacheSize=40960
 #    L2CacheSpeed=
 #    Manufacturer=GenuineIntel
 #    MaxClockSpeed=2900
 #    Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
 #    ProcessorType=3
 #    Revision=27142
 def get_cpu_info(run_lambda):
    """Return a raw CPU description from the platform's native tool.

    Uses lscpu on Linux, wmic on Windows and sysctl on macOS. The command's
    standard output is returned on success and its standard error
    otherwise; an unrecognised platform yields an empty string.
    """
    platform_name = get_platform()
    if platform_name == 'linux':
        command = 'lscpu'
    elif platform_name == 'win32':
        # The backslash continues the literal itself, so the indentation of
        # the next line is part of the command string.
        command = 'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE'
    elif platform_name == 'darwin':
        command = "sysctl -n machdep.cpu.brand_string"
    else:
        return ''
    rc, out, err = run_lambda(command)
    return out if rc == 0 else err
 def get_platform():
    """Normalize sys.platform to 'linux', 'win32', 'cygwin' or 'darwin'.

    Any other platform string is returned unchanged.
    """
    for known in ('linux', 'win32', 'cygwin', 'darwin'):
        if sys.platform.startswith(known):
            return known
    return sys.platform
 def get_mac_version(run_lambda):
    """Read the macOS product version via `sw_vers -productVersion`."""
    version_command = 'sw_vers -productVersion'
    whole_line = r'(.*)'
    return run_and_parse_first_match(run_lambda, version_command, whole_line)
 def get_windows_version(run_lambda):
    """Query the Windows caption via wmic, filtering out the header row.

    Both executables are addressed through SYSTEMROOT (default C:\\Windows).
    """
    system32 = os.path.join(os.environ.get('SYSTEMROOT', 'C:\\Windows'),
                            'System32')
    wmic_cmd = os.path.join(system32, 'Wbem', 'wmic')
    findstr_cmd = os.path.join(system32, 'findstr')
    command = '{} os get Caption | {} /v Caption'.format(
        wmic_cmd, findstr_cmd)
    return run_and_read_all(run_lambda, command)
 def get_lsb_version(run_lambda):
    """Parse the `Description:` field from the `lsb_release -a` output."""
    lsb_command = 'lsb_release -a'
    description_pattern = r'Description:\t(.*)'
    return run_and_parse_first_match(run_lambda, lsb_command,
                                     description_pattern)
 def check_release_file(run_lambda):
    """Parse PRETTY_NAME from the concatenated /etc/*-release files."""
    release_command = 'cat /etc/*-release'
    pretty_name_pattern = r'PRETTY_NAME="(.*)"'
    return run_and_parse_first_match(run_lambda, release_command,
                                     pretty_name_pattern)
 def get_os(run_lambda):
    """Return a human-readable operating system description.

    Windows and Cygwin use the wmic caption, macOS the product version and
    Linux the first distribution name that can be determined; the machine
    architecture is appended for macOS and Linux. An unrecognised platform
    is returned as its bare platform string.
    """
    from platform import machine
    platform = get_platform()
    if platform in ('win32', 'cygwin'):
        return get_windows_version(run_lambda)
    if platform == 'darwin':
        version = get_mac_version(run_lambda)
        if version is None:
            return None
        return 'macOS {} ({})'.format(version, machine())
    if platform != 'linux':
        # Unknown platform
        return platform
    # Try lsb_release first (Ubuntu/Debian based), then /etc/*-release.
    for probe in (get_lsb_version, check_release_file):
        desc = probe(run_lambda)
        if desc is not None:
            return '{} ({})'.format(desc, machine())
    return '{} ({})'.format(platform, machine())
 def get_python_platform():
    """Return the platform identification string from platform.platform()."""
    import platform as platform_module
    description = platform_module.platform()
    return description
 def get_libc_version():
    """Return the libc name and version joined by '-', or 'N/A' off Linux."""
    import platform
    if get_platform() == 'linux':
        return '-'.join(platform.libc_ver())
    return 'N/A'
 def get_pip_packages(run_lambda, patterns=None):
    """Return `pip list` output filtered to the packages of interest.

    Note: this also finds conda-installed pytorch and numpy packages.

    Args:
        run_lambda: Callable used to execute the pip command.
        patterns: Substrings that select the lines to keep; defaults to
            DEFAULT_PIP_PATTERNS.

    Returns:
        A ``(pip_version, listing)`` tuple. ``pip_version`` is 'pip3' or
        'pip' depending on the running interpreter; ``listing`` holds the
        matching lines joined by newlines, or None when the pip listing
        could not be collected.
    """
    if patterns is None:
        patterns = DEFAULT_PIP_PATTERNS
    # People generally have `pip` as `pip` or `pip3`
    # But here it is invoked as `python -mpip`
    def run_with_pip(pip):
        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
        if out is None:
            # No listing available: pass None through so pretty_str can
            # report it as 'Could not collect' instead of crashing here.
            return None
        return "\n".join(line for line in out.splitlines()
                         if any(name in line for name in patterns))
    pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
    out = run_with_pip([sys.executable, '-mpip'])
    return pip_version, out
 def get_cachingallocator_config():
    """Return PYTORCH_CUDA_ALLOC_CONF from the environment ('' when unset)."""
    return os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '')
 def get_cuda_module_loading_config():
    """Return CUDA_MODULE_LOADING after initializing CUDA, or "N/A".

    "N/A" is returned when torch is unavailable or CUDA cannot be used.
    """
    if not (TORCH_AVAILABLE and torch.cuda.is_available()):
        return "N/A"
    # Initialize CUDA before reading the variable.
    # NOTE(review): presumably initialization can populate it; confirm.
    torch.cuda.init()
    return os.environ.get('CUDA_MODULE_LOADING', '')
 def is_xnnpack_available():
    """Return str(torch.backends.xnnpack.enabled), or "N/A" without torch."""
    if not TORCH_AVAILABLE:
        return "N/A"
    import torch.backends.xnnpack
    xnnpack_enabled = torch.backends.xnnpack.enabled  # type: ignore[attr-defined]
    return str(xnnpack_enabled)
 def get_env_info():
    """Collect every field of the environment report into a SystemEnv.

    Combines torch build attributes (when torch is available) with the
    results of the probe helpers, all of which are run through ``run``.
    Fields that depend on torch are set to 'N/A' when it is unavailable.
    """
    run_lambda = run
    pip_version, pip_list_output = get_pip_packages(run_lambda)
    if TORCH_AVAILABLE:
        version_str = torch.__version__
        debug_mode_str = str(torch.version.debug)
        cuda_available_str = str(torch.cuda.is_available())
        cuda_version_str = torch.version.cuda
        if not hasattr(torch.version,
                       'hip') or torch.version.hip is None:  # cuda version
            hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
        else:  # HIP version
            def get_version_or_na(cfg, prefix):
                """Return the last token of the first line containing prefix, else 'N/A'."""
                _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
                return _lst[0] if _lst else 'N/A'
            # Runtime versions are read from torch's build configuration text.
            cfg = torch._C._show_config().split('\n')
            hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime')
            miopen_runtime_version = get_version_or_na(cfg, 'MIOpen')
            # On the HIP branch the CUDA build version is reported as 'N/A'.
            cuda_version_str = 'N/A'
            hip_compiled_version = torch.version.hip
    else:
        version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A'
        hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
    # Flatten sys.version onto one line for the report.
    sys_version = sys.version.replace("\n", " ")
    conda_packages = get_conda_packages(run_lambda)
    rocm_version = get_rocm_version(run_lambda)
    neuron_sdk_version = get_neuron_sdk_version(run_lambda)
    vllm_version = get_vllm_version()
    vllm_build_flags = summarize_vllm_build_flags()
    gpu_topo = get_gpu_topo(run_lambda)
    return SystemEnv(
        torch_version=version_str,
        is_debug_build=debug_mode_str,
        # bit_length() of sys.maxsize plus the sign bit gives the word size.
        python_version='{} ({}-bit runtime)'.format(
            sys_version,
            sys.maxsize.bit_length() + 1),
        python_platform=get_python_platform(),
        is_cuda_available=cuda_available_str,
        cuda_compiled_version=cuda_version_str,
        cuda_runtime_version=get_running_cuda_version(run_lambda),
        cuda_module_loading=get_cuda_module_loading_config(),
        nvidia_gpu_models=get_gpu_info(run_lambda),
        nvidia_driver_version=get_nvidia_driver_version(run_lambda),
        cudnn_version=get_cudnn_version(run_lambda),
        hip_compiled_version=hip_compiled_version,
        hip_runtime_version=hip_runtime_version,
        miopen_runtime_version=miopen_runtime_version,
        pip_version=pip_version,
        pip_packages=pip_list_output,
        conda_packages=conda_packages,
        os=get_os(run_lambda),
        libc_version=get_libc_version(),
        gcc_version=get_gcc_version(run_lambda),
        clang_version=get_clang_version(run_lambda),
        cmake_version=get_cmake_version(run_lambda),
        caching_allocator_config=get_cachingallocator_config(),
        is_xnnpack_available=is_xnnpack_available(),
        cpu_info=get_cpu_info(run_lambda),
        rocm_version=rocm_version,
        neuron_sdk_version=neuron_sdk_version,
        vllm_version=vllm_version,
        vllm_build_flags=vllm_build_flags,
        gpu_topo=gpu_topo,
    )
 # Template for the textual report; each placeholder is a SystemEnv field.
 env_info_fmt = """
 PyTorch version: {torch_version}
 Is debug build: {is_debug_build}
 CUDA used to build PyTorch: {cuda_compiled_version}
 ROCM used to build PyTorch: {hip_compiled_version}
 OS: {os}
 GCC version: {gcc_version}
 Clang version: {clang_version}
 CMake version: {cmake_version}
 Libc version: {libc_version}
 Python version: {python_version}
 Python platform: {python_platform}
 Is CUDA available: {is_cuda_available}
 CUDA runtime version: {cuda_runtime_version}
 CUDA_MODULE_LOADING set to: {cuda_module_loading}
 GPU models and configuration: {nvidia_gpu_models}
 Nvidia driver version: {nvidia_driver_version}
 cuDNN version: {cudnn_version}
 HIP runtime version: {hip_runtime_version}
 MIOpen runtime version: {miopen_runtime_version}
 Is XNNPACK available: {is_xnnpack_available}
 CPU:
 {cpu_info}
 Versions of relevant libraries:
 {pip_packages}
 {conda_packages}
 """.strip()
 # Both halves are strip()ped, so an explicit newline is needed between
 # them; otherwise "ROCM Version:" is glued onto the end of the conda
 # package listing.
 env_info_fmt += "\n"
 env_info_fmt += """
 ROCM Version: {rocm_version}
 Neuron SDK Version: {neuron_sdk_version}
 vLLM Version: {vllm_version}
 vLLM Build Flags:
 {vllm_build_flags}
 GPU Topology:
 {gpu_topo}
 """.strip()
 def pretty_str(envinfo):
    """Render a SystemEnv-style named tuple through env_info_fmt.

    The fields are copied into a dict and rewritten by several ordered
    passes (CUDA placeholders, booleans, None values, empty package lists,
    package prefixes) before being substituted into the template.
    """
    def replace_nones(dct, replacement='Could not collect'):
        """Replace every None value in dct with replacement, in place."""
        for key in dct.keys():
            if dct[key] is not None:
                continue
            dct[key] = replacement
        return dct
    def replace_bools(dct, true='Yes', false='No'):
        """Replace True/False values in dct with the given strings, in place."""
        for key in dct.keys():
            if dct[key] is True:
                dct[key] = true
            elif dct[key] is False:
                dct[key] = false
        return dct
    def prepend(text, tag='[prepend]'):
        """Prefix every line of text with tag."""
        lines = text.split('\n')
        updated_lines = [tag + line for line in lines]
        return '\n'.join(updated_lines)
    def replace_if_empty(text, replacement='No relevant packages'):
        """Return replacement for an empty string; pass anything else through."""
        if text is not None and len(text) == 0:
            return replacement
        return text
    def maybe_start_on_next_line(string):
        """Wrap a multi-line string in newlines so it starts on its own line."""
        # If `string` is multiline, prepend a \n to it.
        if string is not None and len(string.split('\n')) > 1:
            return '\n{}\n'.format(string)
        return string
    mutable_dict = envinfo._asdict()
    # If nvidia_gpu_models is multiline, start on the next line
    mutable_dict['nvidia_gpu_models'] = \
        maybe_start_on_next_line(envinfo.nvidia_gpu_models)
    # If the machine doesn't have CUDA, report some fields as 'No CUDA'
    dynamic_cuda_fields = [
        'cuda_runtime_version',
        'nvidia_gpu_models',
        'nvidia_driver_version',
    ]
    all_cuda_fields = dynamic_cuda_fields + ['cudnn_version']
    all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None
                                          for field in dynamic_cuda_fields)
    # This must run before replace_nones, which would otherwise turn the
    # missing CUDA fields into 'Could not collect'.
    if TORCH_AVAILABLE and not torch.cuda.is_available(
    ) and all_dynamic_cuda_fields_missing:
        for field in all_cuda_fields:
            mutable_dict[field] = 'No CUDA'
        if envinfo.cuda_compiled_version is None:
            mutable_dict['cuda_compiled_version'] = 'None'
    # Replace True with Yes, False with No
    mutable_dict = replace_bools(mutable_dict)
    # Replace all None objects with 'Could not collect'
    mutable_dict = replace_nones(mutable_dict)
    # If either of these are '', replace with 'No relevant packages'
    mutable_dict['pip_packages'] = replace_if_empty(
        mutable_dict['pip_packages'])
    mutable_dict['conda_packages'] = replace_if_empty(
        mutable_dict['conda_packages'])
    # Tag conda and pip packages with a prefix
    # If they were previously None, they'll show up as ie '[conda] Could not collect'
    if mutable_dict['pip_packages']:
        mutable_dict['pip_packages'] = prepend(
            mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version))
    if mutable_dict['conda_packages']:
        mutable_dict['conda_packages'] = prepend(
            mutable_dict['conda_packages'], '[conda] ')
    # Re-assign cpu_info from the original tuple before formatting.
    mutable_dict['cpu_info'] = envinfo.cpu_info
    return env_info_fmt.format(**mutable_dict)
 def get_pretty_env_info():
    """Collect the environment information and return it as formatted text."""
    collected = get_env_info()
    return pretty_str(collected)
 def main():
    """Print the environment report and point out the newest minidump.

    The report goes to stdout. On Linux, when torch exposes a crash handler
    and its minidump directory contains entries, a note naming the most
    recently created one is written to stderr.
    """
    print("Collecting environment information...")
    output = get_pretty_env_info()
    print(output)
    if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(
            torch.utils, '_crash_handler'):
        minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
        if sys.platform == "linux" and os.path.exists(minidump_dir):
            dumps = [
                os.path.join(minidump_dir, dump)
                for dump in os.listdir(minidump_dir)
            ]
            # max() raises ValueError on an empty sequence, so an existing
            # but empty minidump directory must be skipped.
            if not dumps:
                return
            latest = max(dumps, key=os.path.getctime)
            ctime = os.path.getctime(latest)
            creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
                '%Y-%m-%d %H:%M:%S')
            msg = (
                "\n*** Detected a minidump at {} created on {}, ".format(
                    latest, creation_time) +
                "if this is related to your bug please include it when you file a report ***"
            )
            print(msg, file=sys.stderr)
 # Run the report only when executed as a script, not when imported.
 if __name__ == '__main__':
    main()
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -33,12 +33,25 @@ template<typename T>
 __device__ __forceinline__ T gelu_kernel(const T& x) {
  // Equivalent to PyTorch GELU with 'none' approximation.
  // Refer to:
-  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38
+  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
  const float f = (float) x;
  constexpr float ALPHA = M_SQRT1_2;
  return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
 }
 template<typename T>
 __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
  // Equivalent to PyTorch GELU with 'tanh' approximation.
  // Refer to:
  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
  // The input is converted to float for the arithmetic and the result is
  // cast back to T on return.
  const float f = (float) x;
  // BETA = sqrt(2) * (2 / sqrt(pi)) * 0.5 = sqrt(2 / pi).
  constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
  constexpr float KAPPA = 0.044715;
  float x_cube = f * f * f;
  float inner = BETA * (f + KAPPA * x_cube);
  // Result: 0.5 * x * (1 + tanh(BETA * (x + KAPPA * x^3))).
  return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
 }
 } // namespace vllm
 // Launch activation and gating kernel.
@@ -73,6 +86,13 @@ void gelu_and_mul(
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
 }
 // Activation-and-gating entry point that uses vllm::gelu_tanh_kernel as
 // the activation function.
 void gelu_tanh_and_mul(
  torch::Tensor& out,      // [..., d]
  torch::Tensor& input)    // [..., 2 * d]
 {
  // NOTE(review): the launch macro is defined outside this hunk; presumably
  // it reads `out` and `input` from the enclosing scope. Confirm.
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
 }
 namespace vllm {
 // Element-wise activation kernel template.
--- a/csrc/attention/attention_dtypes.h
+++ b/csrc/attention/attention_dtypes.h
@@ -4,4 +4,4 @@
 #include "dtype_float16.cuh"
 #include "dtype_float32.cuh"
 #include "dtype_bfloat16.cuh"
-#include "dtype_fp8_e5m2.cuh"
+#include "dtype_fp8.cuh"
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -15,9 +15,6 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifdef USE_ROCM
 #include <hip/hip_runtime.h>
 #endif
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -25,17 +22,26 @@
 #include "attention_dtypes.h"
 #include "attention_utils.cuh"
-#ifdef ENABLE_FP8_E5M2
+
 #if defined(ENABLE_FP8_E5M2)
 #include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
 #elif defined(ENABLE_FP8_E4M3)
 #include "../quantization/fp8/amd_detail/quant_utils.cuh"
 #endif
 #include <algorithm>
 #ifdef USE_ROCM
  #include <hip/hip_bf16.h>
  typedef __hip_bfloat16 __nv_bfloat16;
 #endif
 #ifndef USE_ROCM
 #define WARP_SIZE 32
 #else
 #define WARP_SIZE warpSize
 #endif
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@@ -86,7 +92,7 @@ template<
  int HEAD_SIZE,
  int BLOCK_SIZE,
  int NUM_THREADS,
-  bool IS_FP8_E5M2_KV_CACHE,
+  bool IS_FP8_KV_CACHE,
  int PARTITION_SIZE = 0> // Zero means no partitioning.
 __device__ void paged_attention_kernel(
  float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
@@ -98,33 +104,34 @@ __device__ void paged_attention_kernel(
  const int num_kv_heads,                 // [num_heads]
  const float scale,
  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
+  const int* __restrict__ seq_lens,   // [num_seqs]
  const int max_num_blocks_per_seq,
  const float* __restrict__ alibi_slopes, // [num_heads]
  const int q_stride,
  const int kv_block_stride,
-  const int kv_head_stride) {
+  const int kv_head_stride,
  const float kv_scale) {
  const int seq_idx = blockIdx.y;
  const int partition_idx = blockIdx.z;
  const int max_num_partitions = gridDim.z;
  constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
-  const int context_len = context_lens[seq_idx];
+  const int seq_len = seq_lens[seq_idx];
-  if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
+  if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) {
    // No work to do. Terminate the thread block.
    return;
  }
-  const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
+  const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
-  const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
+  const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
  // [start_block_idx, end_block_idx) is the range of blocks to process.
  const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
-  const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
+  const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
  const int num_blocks = end_block_idx - start_block_idx;
  // [start_token_idx, end_token_idx) is the range of tokens to process.
  const int start_token_idx = start_block_idx * BLOCK_SIZE;
-  const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
+  const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
  const int num_tokens = end_token_idx - start_token_idx;
  constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
@@ -150,7 +157,7 @@ __device__ void paged_attention_kernel(
  constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
  using K_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
  using Q_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
  using Quant_vec = typename Vec<cache_t, VEC_SIZE>::Type;
 #endif
@@ -216,11 +223,16 @@ __device__ void paged_attention_kernel(
        const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
        const int offset1 = (vec_idx * VEC_SIZE) / x;
        const int offset2 = (vec_idx * VEC_SIZE) % x;
-        if constexpr (IS_FP8_E5M2_KV_CACHE) {
+        if constexpr (IS_FP8_KV_CACHE) {
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2)
          Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
          // Vector conversion from Quant_vec to K_vec.
          k_vecs[j] = fp8_e5m2_unscaled::vec_conversion<K_vec, Quant_vec>(k_vec_quant);
 #elif defined(ENABLE_FP8_E4M3)
          Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
          // Vector conversion from Quant_vec to K_vec. Use scaled_vec_conversion to convert FP8_E4M3 quantized k
          // cache vec to k vec in higher precision (FP16, BFloat16, etc.)
          k_vecs[j] = fp8_e4m3::scaled_vec_conversion<K_vec, Quant_vec>(k_vec_quant, kv_scale);
 #else
          assert(false);
 #endif
@@ -233,12 +245,12 @@ __device__ void paged_attention_kernel(
      // This includes a reduction across the threads in the same thread group.
      float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
      // Add the ALiBi bias if slopes are given.
-      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
+      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
      if (thread_group_offset == 0) {
        // Store the partial reductions to shared memory.
        // NOTE(woosuk): It is required to zero out the masked logits.
-        const bool mask = token_idx >= context_len;
+        const bool mask = token_idx >= seq_len;
        logits[token_idx - start_token_idx] = mask ? 0.f : qk;
        // Update the max value.
        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
@@ -300,7 +312,7 @@ __device__ void paged_attention_kernel(
  constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
  using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
  using L_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
  using V_quant_vec = typename Vec<cache_t, V_VEC_SIZE>::Type;
 #endif
  using Float_L_vec = typename FloatVec<L_vec>::Type;
@@ -336,25 +348,30 @@ __device__ void paged_attention_kernel(
      if (row_idx < HEAD_SIZE) {
        const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
        V_vec v_vec;
-        if constexpr (IS_FP8_E5M2_KV_CACHE) {
+        if constexpr (IS_FP8_KV_CACHE) {
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2)
          V_quant_vec v_quant_vec = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
          // Vector conversion from V_quant_vec to V_vec.
          v_vec = fp8_e5m2_unscaled::vec_conversion<V_vec, V_quant_vec>(v_quant_vec);
 #elif defined(ENABLE_FP8_E4M3)
          V_quant_vec v_quant_vec = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
          // Vector conversion from V_quant_vec to V_vec. Use scaled_vec_conversion to convert
          // FP8_E4M3 quantized v cache vec to v vec in higher precision (FP16, BFloat16, etc.)
          v_vec = fp8_e4m3::scaled_vec_conversion<V_vec, V_quant_vec>(v_quant_vec, kv_scale);
 #else
          assert(false);
 #endif
        } else {
          v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
        }
-        if (block_idx == num_context_blocks - 1) {
+        if (block_idx == num_seq_blocks - 1) {
          // NOTE(woosuk): When v_vec contains the tokens that are out of the context,
          // we should explicitly zero out the values since they may contain NaNs.
          // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
          scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
 #pragma unroll
          for (int j = 0; j < V_VEC_SIZE; j++) {
-            v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
+            v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value;
          }
        }
        accs[i] += dot(logits_vec, v_vec);
@@ -431,7 +448,7 @@ template<
  int HEAD_SIZE,
  int BLOCK_SIZE,
  int NUM_THREADS,
-  bool IS_FP8_E5M2_KV_CACHE>
+  bool IS_FP8_KV_CACHE>
 __global__ void paged_attention_v1_kernel(
  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
@@ -440,16 +457,17 @@ __global__ void paged_attention_v1_kernel(
  const int num_kv_heads,                 // [num_heads]
  const float scale,
  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
+  const int* __restrict__ seq_lens,   // [num_seqs]
  const int max_num_blocks_per_seq,
  const float* __restrict__ alibi_slopes, // [num_heads]
  const int q_stride,
  const int kv_block_stride,
-  const int kv_head_stride) {
+  const int kv_head_stride,
-  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_E5M2_KV_CACHE>(
+  const float kv_scale) {
  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE>(
    /* exp_sums */ nullptr, /* max_logits */ nullptr,
-    out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
+    out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens,
-    max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride);
+    max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale);
 }
 // Grid: (num_heads, num_seqs, max_num_partitions).
@@ -459,7 +477,7 @@ template<
  int HEAD_SIZE,
  int BLOCK_SIZE,
  int NUM_THREADS,
-  bool IS_FP8_E5M2_KV_CACHE,
+  bool IS_FP8_KV_CACHE,
  int PARTITION_SIZE>
 __global__ void paged_attention_v2_kernel(
  float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
@@ -471,16 +489,17 @@ __global__ void paged_attention_v2_kernel(
  const int num_kv_heads,                 // [num_heads]
  const float scale,
  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
+  const int* __restrict__ seq_lens,   // [num_seqs]
  const int max_num_blocks_per_seq,
  const float* __restrict__ alibi_slopes, // [num_heads]
  const int q_stride,
  const int kv_block_stride,
-  const int kv_head_stride) {
+  const int kv_head_stride,
-  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_E5M2_KV_CACHE, PARTITION_SIZE>(
+  const float kv_scale) {
  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE, PARTITION_SIZE>(
    exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
-    block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
+    block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes,
-    q_stride, kv_block_stride, kv_head_stride);
+    q_stride, kv_block_stride, kv_head_stride, kv_scale);
 }
 // Grid: (num_heads, num_seqs).
@@ -494,13 +513,13 @@ __global__ void paged_attention_v2_reduce_kernel(
  const float* __restrict__ exp_sums,     // [num_seqs, num_heads, max_num_partitions]
  const float* __restrict__ max_logits,   // [num_seqs, num_heads, max_num_partitions]
  const scalar_t* __restrict__ tmp_out,   // [num_seqs, num_heads, max_num_partitions, head_size]
-  const int* __restrict__ context_lens,   // [num_seqs]
+  const int* __restrict__ seq_lens,   // [num_seqs]
  const int max_num_partitions) {
  const int num_heads = gridDim.x;
  const int head_idx = blockIdx.x;
  const int seq_idx = blockIdx.y;
-  const int context_len = context_lens[seq_idx];
+  const int seq_len = seq_lens[seq_idx];
-  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
+  const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
  if (num_partitions == 1) {
    // No need to reduce. Only copy tmp_out to out.
    scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
@@ -587,9 +606,9 @@ __global__ void paged_attention_v2_reduce_kernel(
 #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                                  \
  VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                                       \
    ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,   \
-      IS_FP8_E5M2_KV_CACHE>), shared_mem_size);                                               \
+      IS_FP8_KV_CACHE>), shared_mem_size);                                                    \
  vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,             \
-  IS_FP8_E5M2_KV_CACHE><<<grid, block, shared_mem_size, stream>>>(                            \
+  IS_FP8_KV_CACHE><<<grid, block, shared_mem_size, stream>>>(                                 \
    out_ptr,                                                                                  \
    query_ptr,                                                                                \
    key_cache_ptr,                                                                            \
@@ -597,19 +616,20 @@ __global__ void paged_attention_v2_reduce_kernel(
    num_kv_heads,                                                                             \
    scale,                                                                                    \
    block_tables_ptr,                                                                         \
-    context_lens_ptr,                                                                         \
+    seq_lens_ptr,                                                                              \
    max_num_blocks_per_seq,                                                                   \
    alibi_slopes_ptr,                                                                         \
    q_stride,                                                                                 \
    kv_block_stride,                                                                          \
-    kv_head_stride);
+    kv_head_stride,                                                                           \
    kv_scale);
 // TODO(woosuk): Tune NUM_THREADS.
 template<
  typename T,
  typename CACHE_T,
  int BLOCK_SIZE,
-  bool IS_FP8_E5M2_KV_CACHE,
+  bool IS_FP8_KV_CACHE,
  int NUM_THREADS = 128>
 void paged_attention_v1_launcher(
  torch::Tensor& out,
@@ -619,9 +639,10 @@ void paged_attention_v1_launcher(
  int num_kv_heads,
  float scale,
  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
+  torch::Tensor& seq_lens,
-  int max_context_len,
+  int max_seq_len,
-  const c10::optional<torch::Tensor>& alibi_slopes) {
+  const c10::optional<torch::Tensor>& alibi_slopes,
  float kv_scale) {
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
@@ -643,11 +664,11 @@ void paged_attention_v1_launcher(
  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
+  int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
-  int logits_size = padded_max_context_len * sizeof(float);
+  int logits_size = padded_max_seq_len * sizeof(float);
  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
  // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
  // Keep that in sync with the logic here!
@@ -685,8 +706,8 @@ void paged_attention_v1_launcher(
  }
 }
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE)       \
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE)            \
-  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE>( \
+  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE>(      \
    out,                                                                     \
    query,                                                                   \
    key_cache,                                                               \
@@ -694,22 +715,23 @@ void paged_attention_v1_launcher(
    num_kv_heads,                                                            \
    scale,                                                                   \
    block_tables,                                                            \
-    context_lens,                                                            \
+    seq_lens,                                                            \
-    max_context_len,                                                         \
+    max_seq_len,                                                         \
-    alibi_slopes);
+    alibi_slopes,                                                            \
    kv_scale);
 // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
 // 1, 2, 4, 64, 128, 256.
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE)      \
  switch (block_size) {                                               \
    case 8:                                                           \
-      CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE);          \
+      CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE);               \
      break;                                                          \
    case 16:                                                          \
-      CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE);         \
+      CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE);              \
      break;                                                          \
    case 32:                                                          \
-      CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE);         \
+      CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE);              \
      break;                                                          \
    default:                                                          \
      TORCH_CHECK(false, "Unsupported block size: ", block_size);     \
@@ -724,11 +746,12 @@ void paged_attention_v1(
  int num_kv_heads,               // [num_heads]
  float scale,
  torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
-  torch::Tensor& context_lens,    // [num_seqs]
+  torch::Tensor& seq_lens,    // [num_seqs]
  int block_size,
-  int max_context_len,
+  int max_seq_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
-  const std::string& kv_cache_dtype) {
+  const std::string& kv_cache_dtype,
  float kv_scale) {
  if (kv_cache_dtype == "auto") {
    if (query.dtype() == at::ScalarType::Float) {
      CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false);
@@ -739,7 +762,7 @@ void paged_attention_v1(
    } else {
      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
    }
-  } else if (kv_cache_dtype == "fp8_e5m2") {
+  } else if (kv_cache_dtype == "fp8") {
    if (query.dtype() == at::ScalarType::Float) {
      CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
    } else if (query.dtype() == at::ScalarType::Half) {
@@ -756,7 +779,7 @@ void paged_attention_v1(
 #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE)                                                  \
  vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,             \
-  IS_FP8_E5M2_KV_CACHE, PARTITION_SIZE>                                                       \
+  IS_FP8_KV_CACHE, PARTITION_SIZE>                                                            \
  <<<grid, block, shared_mem_size, stream>>>(                                                 \
    exp_sums_ptr,                                                                             \
    max_logits_ptr,                                                                           \
@@ -767,26 +790,27 @@ void paged_attention_v1(
    num_kv_heads,                                                                             \
    scale,                                                                                    \
    block_tables_ptr,                                                                         \
-    context_lens_ptr,                                                                         \
+    seq_lens_ptr,                                                                         \
    max_num_blocks_per_seq,                                                                   \
    alibi_slopes_ptr,                                                                         \
    q_stride,                                                                                 \
    kv_block_stride,                                                                          \
-    kv_head_stride);                                                                          \
+    kv_head_stride,                                                                           \
    kv_scale);                                                                                \
  vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE>           \
  <<<reduce_grid, block, reduce_shared_mem_size, stream>>>(                                   \
    out_ptr,                                                                                  \
    exp_sums_ptr,                                                                             \
    max_logits_ptr,                                                                           \
    tmp_out_ptr,                                                                              \
-    context_lens_ptr,                                                                         \
+    seq_lens_ptr,                                                                         \
    max_num_partitions);
 template<
  typename T,
  typename CACHE_T,
  int BLOCK_SIZE,
-  bool IS_FP8_E5M2_KV_CACHE,
+  bool IS_FP8_KV_CACHE,
  int NUM_THREADS = 128,
  int PARTITION_SIZE = 512>
 void paged_attention_v2_launcher(
@@ -800,9 +824,10 @@ void paged_attention_v2_launcher(
  int num_kv_heads,
  float scale,
  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
+  torch::Tensor& seq_lens,
-  int max_context_len,
+  int max_seq_len,
-  const c10::optional<torch::Tensor>& alibi_slopes) {
+  const c10::optional<torch::Tensor>& alibi_slopes,
  float kv_scale) {
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
@@ -827,10 +852,10 @@ void paged_attention_v2_launcher(
  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
+  int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
  int logits_size = PARTITION_SIZE * sizeof(float);
  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
@@ -872,8 +897,8 @@ void paged_attention_v2_launcher(
  }
 }
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE)           \
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE)                \
-  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE>(     \
+  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE>(          \
    out,                                                                         \
    exp_sums,                                                                    \
    max_logits,                                                                  \
@@ -884,22 +909,23 @@ void paged_attention_v2_launcher(
    num_kv_heads,                                                                \
    scale,                                                                       \
    block_tables,                                                                \
-    context_lens,                                                                \
+    seq_lens,                                                                \
-    max_context_len,                                                             \
+    max_seq_len,                                                             \
-    alibi_slopes);
+    alibi_slopes,                                                                \
    kv_scale);
 // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
 // 1, 2, 4, 64, 128, 256.
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE)       \
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE)            \
  switch (block_size) {                                                     \
    case 8:                                                                 \
-      CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE);                \
+      CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE);                     \
      break;                                                                \
    case 16:                                                                \
-      CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE);               \
+      CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE);                    \
      break;                                                                \
    case 32:                                                                \
-      CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE);               \
+      CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE);                    \
      break;                                                                \
    default:                                                                \
      TORCH_CHECK(false, "Unsupported block size: ", block_size);           \
@@ -917,11 +943,12 @@ void paged_attention_v2(
  int num_kv_heads,               // [num_heads]
  float scale,
  torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
-  torch::Tensor& context_lens,    // [num_seqs]
+  torch::Tensor& seq_lens,    // [num_seqs]
  int block_size,
-  int max_context_len,
+  int max_seq_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
-  const std::string& kv_cache_dtype) {
+  const std::string& kv_cache_dtype,
  float kv_scale) {
  if (kv_cache_dtype == "auto") {
    if (query.dtype() == at::ScalarType::Float) {
      CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false);
@@ -932,7 +959,7 @@ void paged_attention_v2(
    } else {
      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
    }
-  } else if (kv_cache_dtype == "fp8_e5m2") {
+  } else if (kv_cache_dtype == "fp8") {
    if (query.dtype() == at::ScalarType::Float) {
      CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
    } else if (query.dtype() == at::ScalarType::Half) {
--- a/csrc/attention/dtype_fp8_e5m2.cuh
+++ b/csrc/attention/dtype_fp8_e5m2.cuh
@@ -8,7 +8,7 @@
 #endif
 namespace vllm {
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
 // fp8 vector types for quantization of kv cache
 template<>
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -16,6 +16,15 @@ void copy_blocks(
  const std::map<int64_t, std::vector<int64_t>>& block_mapping);
 // Stores `key`/`value` into the paged `key_cache`/`value_cache`; the target
 // slot of each token is read from `slot_mapping`.
 // `kv_cache_dtype` selects the cache element format ("fp8" takes the uint8
 // cache path). `kv_scale` is forwarded to the kernel, where the FP8 E4M3
 // build passes it to the scaled conversion; the E5M2 conversion does not
 // take it.
 void reshape_and_cache(
  torch::Tensor& key,
  torch::Tensor& value,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
  torch::Tensor& slot_mapping,
  const std::string& kv_cache_dtype,
  const float kv_scale);
 void reshape_and_cache_flash(
  torch::Tensor& key,
  torch::Tensor& value,
  torch::Tensor& key_cache,
@@ -24,6 +33,6 @@ void reshape_and_cache(
  const std::string& kv_cache_dtype);
 // Just for unittest
-void convert_fp8_e5m2(
+void convert_fp8(
  torch::Tensor& src_cache,
  torch::Tensor& dst_cache);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -4,8 +4,10 @@
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2)
 #include "quantization/fp8_e5m2_kvcache/quant_utils.cuh"
 #elif defined(ENABLE_FP8_E4M3)
 #include "quantization/fp8/amd_detail/quant_utils.cuh"
 #endif
 #include <algorithm>
@@ -151,7 +153,7 @@ void copy_blocks(
 namespace vllm {
-template<typename scalar_t, typename cache_t, bool is_fp8_e5m2_kv_cache>
+template<typename scalar_t, typename cache_t, bool is_fp8_kv_cache>
 __global__ void reshape_and_cache_kernel(
  const scalar_t* __restrict__ key,           // [num_tokens, num_heads, head_size]
  const scalar_t* __restrict__ value,         // [num_tokens, num_heads, head_size]
@@ -163,7 +165,8 @@ __global__ void reshape_and_cache_kernel(
  const int num_heads,
  const int head_size,
  const int block_size,
-  const int x) {
+  const int x,
  const float kv_scale) {
  const int64_t token_idx = blockIdx.x;
  const int64_t slot_idx = slot_mapping[token_idx];
  if (slot_idx < 0) {
@@ -195,10 +198,13 @@ __global__ void reshape_and_cache_kernel(
                                  + block_offset;
    scalar_t tgt_key = key[src_key_idx];
    scalar_t tgt_value = value[src_value_idx];
-    if constexpr (is_fp8_e5m2_kv_cache) {
+    if constexpr (is_fp8_kv_cache) {
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2)
      key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_key);
      value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_value);
 #elif defined(ENABLE_FP8_E4M3)
      key_cache[tgt_key_idx] = fp8_e4m3::scaled_vec_conversion<uint8_t, scalar_t>(tgt_key, kv_scale);
      value_cache[tgt_value_idx] = fp8_e4m3::scaled_vec_conversion<uint8_t, scalar_t>(tgt_value, kv_scale);
 #else
      assert(false);
 #endif
@@ -209,10 +215,45 @@ __global__ void reshape_and_cache_kernel(
  }
 }
 // Copies one token's key/value vectors into the flash-layout KV cache.
 // The thread block at blockIdx.x handles token blockIdx.x; its threads step
 // through the token's num_heads * head_size elements in strides of blockDim.x.
 template<typename scalar_t>
 __global__ void reshape_and_cache_flash_kernel(
  const scalar_t* __restrict__ key,           // [num_tokens, num_heads, head_size]
  const scalar_t* __restrict__ value,         // [num_tokens, num_heads, head_size]
  scalar_t* __restrict__ k_cache,             // [num_blocks, block_size, num_heads, head_size]
  scalar_t* __restrict__ v_cache,             // [num_blocks, block_size, num_heads, head_size]
  const int64_t* __restrict__ slot_mapping,   // [num_tokens]
  const int block_stride,
  const int key_stride,
  const int value_stride,
  const int num_heads,
  const int head_size,
  const int block_size) {
  const int64_t token_idx = blockIdx.x;
  const int64_t slot_idx = slot_mapping[token_idx];
  // NOTE: a negative slot (e.g. -1) marks a padded token; nothing is written.
  if (slot_idx >= 0) {
    const int64_t cache_block = slot_idx / block_size;
    const int64_t slot_in_block = slot_idx % block_size;
    const int elems_per_token = num_heads * head_size;
    // Offsets that are identical for every element of this token.
    const int64_t key_base = token_idx * key_stride;
    const int64_t value_base = token_idx * value_stride;
    const int64_t slot_base = cache_block * block_stride
                            + slot_in_block * num_heads * head_size;
    int elem = threadIdx.x;
    while (elem < elems_per_token) {
      const int head = elem / head_size;
      const int offset_in_head = elem % head_size;
      // Key and value caches share the same layout, hence one target index.
      const int64_t dst = slot_base + head * head_size + offset_in_head;
      k_cache[dst] = key[key_base + elem];
      v_cache[dst] = value[value_base + elem];
      elem += blockDim.x;
    }
  }
 }
 } // namespace vllm
-#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_E5M2_KV_CACHE)                                \
+#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE)                                     \
-  vllm::reshape_and_cache_kernel<KV_T, CACHE_T, IS_FP8_E5M2_KV_CACHE><<<grid, block, 0, stream>>>( \
+  vllm::reshape_and_cache_kernel<KV_T, CACHE_T, IS_FP8_KV_CACHE><<<grid, block, 0, stream>>>(      \
    reinterpret_cast<KV_T*>(key.data_ptr()),                                                       \
    reinterpret_cast<KV_T*>(value.data_ptr()),                                                     \
    reinterpret_cast<CACHE_T*>(key_cache.data_ptr()),                                              \
@@ -223,7 +264,8 @@ __global__ void reshape_and_cache_kernel(
    num_heads,                                                                                     \
    head_size,                                                                                     \
    block_size,                                                                                    \
-    x);
+    x,                                                                                             \
    kv_scale);
 void reshape_and_cache(
  torch::Tensor& key,           // [num_tokens, num_heads, head_size]
@@ -231,7 +273,8 @@ void reshape_and_cache(
  torch::Tensor& key_cache,     // [num_blocks, num_heads, head_size/x, block_size, x]
  torch::Tensor& value_cache,   // [num_blocks, num_heads, head_size, block_size]
  torch::Tensor& slot_mapping,  // [num_tokens]
-  const std::string& kv_cache_dtype)
+  const std::string& kv_cache_dtype,
  const float kv_scale)
 {
  int num_tokens = key.size(0);
  int num_heads = key.size(1);
@@ -254,7 +297,7 @@ void reshape_and_cache(
    } else if (key.dtype() == at::ScalarType::BFloat16) {
      CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false);
    }
-  } else if (kv_cache_dtype == "fp8_e5m2") {
+  } else if (kv_cache_dtype == "fp8") {
    if (key.dtype() == at::ScalarType::Float) {
      CALL_RESHAPE_AND_CACHE(float, uint8_t, true);
    } else if (key.dtype() == at::ScalarType::Half) {
@@ -267,18 +310,65 @@ void reshape_and_cache(
  }
 }
 // Host launcher for reshape_and_cache_flash_kernel: one thread block per
 // token, at most 512 threads per block, on the current CUDA stream of
 // `key`'s device.
 void reshape_and_cache_flash(
  torch::Tensor& key,           // [num_tokens, num_heads, head_size]
  torch::Tensor& value,         // [num_tokens, num_heads, head_size]
  torch::Tensor& k_cache,       // [num_blocks, block_size, num_heads, head_size]
  torch::Tensor& v_cache,       // [num_blocks, block_size, num_heads, head_size]
  torch::Tensor& slot_mapping,  // [num_tokens]
  const std::string& kv_cache_dtype)
 {
  // FIXME: only support auto datatype, does not support fp8
  TORCH_CHECK(kv_cache_dtype == "auto",
              "Unsupported data type of kv cache: ", kv_cache_dtype);
  const int token_count = key.size(0);
  const int head_count = key.size(1);
  const int head_dim = key.size(2);
  const int slots_per_block = k_cache.size(1);
  const int key_row_stride = key.stride(0);
  const int value_row_stride = value.stride(0);
  const int cache_block_stride = k_cache.stride(0);
  // The kernel indexes both caches with a single block stride.
  TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0));
  const int threads_per_block = std::min(head_count * head_dim, 512);
  dim3 grid(token_count);
  dim3 block(threads_per_block);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    key.scalar_type(),
    "reshape_and_cache_flash",
    [&] {
      vllm::reshape_and_cache_flash_kernel<scalar_t><<<grid, block, 0, stream>>>(
        key.data_ptr<scalar_t>(),
        value.data_ptr<scalar_t>(),
        k_cache.data_ptr<scalar_t>(),
        v_cache.data_ptr<scalar_t>(),
        slot_mapping.data_ptr<int64_t>(),
        cache_block_stride,
        key_row_stride,
        value_row_stride,
        head_count,
        head_dim,
        slots_per_block);
    });
 }
 namespace vllm {
 template<typename Tout, typename Tin>
-__global__ void convert_fp8_e5m2_kernel(
+__global__ void convert_fp8_kernel(
  const Tin* __restrict__ src_cache,
  Tout* __restrict__ dst_cache,
  const int64_t block_stride) {
  const int64_t block_idx = blockIdx.x;
  for (int i = threadIdx.x; i < block_stride; i += blockDim.x) {
    int64_t idx = block_idx * block_stride + i;
-#ifdef ENABLE_FP8_E5M2
+#if defined(ENABLE_FP8_E5M2)
    dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion<Tout, Tin>(src_cache[idx]);
 #elif defined(ENABLE_FP8_E4M3)
    dst_cache[idx] = fp8_e4m3::vec_conversion<Tout, Tin>(src_cache[idx]);
 #else
    assert(false);
 #endif
@@ -287,16 +377,25 @@ __global__ void convert_fp8_e5m2_kernel(
 } // namespace vllm
-#define CALL_CONVERT_FP8_E5M2(Tout, Tin)                                 \
+#define CALL_CONVERT_FP8(Tout, Tin)                                 \
-  vllm::convert_fp8_e5m2_kernel<Tout, Tin><<<grid, block, 0, stream>>>(  \
+  vllm::convert_fp8_kernel<Tout, Tin><<<grid, block, 0, stream>>>(  \
-    reinterpret_cast<Tin*>(src_cache.data_ptr()),                        \
+    reinterpret_cast<Tin*>(src_cache.data_ptr()),                   \
-    reinterpret_cast<Tout*>(dst_cache.data_ptr()),                       \
+    reinterpret_cast<Tout*>(dst_cache.data_ptr()),                  \
    block_stride);
-void convert_fp8_e5m2(
+void convert_fp8(
  torch::Tensor& src_cache,
  torch::Tensor& dst_cache)
 {
  torch::Device src_device = src_cache.device();
  torch::Device dst_device = dst_cache.device();
  TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
  TORCH_CHECK(dst_device.is_cuda(), "dst must be on a GPU")
  TORCH_CHECK(
    src_device.index() == dst_device.index(),
    "src and dst must be on the same GPU");
  at::cuda::OptionalCUDAGuard device_guard(src_device);
  int64_t num_blocks = src_cache.size(0);
  int64_t block_stride = src_cache.stride(0);
@@ -305,16 +404,16 @@ void convert_fp8_e5m2(
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  if (src_cache.dtype() == at::ScalarType::Float) {
-    CALL_CONVERT_FP8_E5M2(uint8_t, float);
+    CALL_CONVERT_FP8(uint8_t, float);
  } else if (src_cache.dtype() == at::ScalarType::Half) {
-    CALL_CONVERT_FP8_E5M2(uint8_t, uint16_t);
+    CALL_CONVERT_FP8(uint8_t, uint16_t);
  } else if (src_cache.dtype() == at::ScalarType::BFloat16) {
-    CALL_CONVERT_FP8_E5M2(uint8_t, __nv_bfloat16);
+    CALL_CONVERT_FP8(uint8_t, __nv_bfloat16);
  } else if (dst_cache.dtype() == at::ScalarType::Float) {
-    CALL_CONVERT_FP8_E5M2(float, uint8_t);
+    CALL_CONVERT_FP8(float, uint8_t);
  } else if (dst_cache.dtype() == at::ScalarType::Half) {
-    CALL_CONVERT_FP8_E5M2(uint16_t, uint8_t);
+    CALL_CONVERT_FP8(uint16_t, uint8_t);
  } else if (dst_cache.dtype() == at::ScalarType::BFloat16) {
-    CALL_CONVERT_FP8_E5M2(__nv_bfloat16, uint8_t);
+    CALL_CONVERT_FP8(__nv_bfloat16, uint8_t);
  }
 }
--- a/csrc/cpu/activation.cpp
+++ b/csrc/cpu/activation.cpp
@@ -0,0 +1,148 @@
 #include "cpu_types.hpp"
 namespace {
 // Applies `func` to `input` one vector at a time and writes the result to
 // `output`, which holds `num_tokens` rows of `d` elements.
 //   is_gated == false: input rows are `d` wide; output = func(input).
 //   is_gated == true:  input rows are `2 * d` wide; output =
 //                      func(input[:d]) * input[d:].
 // `d` must be a multiple of the vector width (checked below).
 template <typename scalar_t, vec_op::FP32Vec8 (*func)(const vec_op::FP32Vec8 &),
          bool is_gated>
 void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input,
                        scalar_t *__restrict__ output) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  // Every inner iteration loads and stores a whole vector; no tail handling.
  TORCH_CHECK(d % VEC_ELEM_NUM == 0);
 #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Row offsets are formed in 64 bits: i * d (and twice that for gated
    // inputs) can exceed INT_MAX for large inputs, which would be signed
    // overflow in plain int arithmetic. They are also row-invariant, so they
    // are computed once per row rather than once per vector.
    const long long out_row = static_cast<long long>(i) * d;
    const long long in_row = is_gated ? out_row * 2 : out_row;
    for (int j = 0; j < d; j += VEC_ELEM_NUM) {
      const scalar_vec_t x(input + in_row + j);
      const vec_op::FP32Vec8 f32_x(x);
      vec_op::FP32Vec8 f32_ans = func(f32_x);
      if constexpr (is_gated) {
        // The gate operand lives in the second half of the input row.
        const scalar_vec_t y(input + in_row + d + j);
        const vec_op::FP32Vec8 f32_y(y);
        f32_ans = f32_y * f32_ans;
      }
      const scalar_vec_t result(f32_ans);
      result.save(output + out_row + j);
    }
  }
 }
 // SiLU, evaluated as x / (1 + exp(-x)).
 FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 zero(0.0);
  const vec_op::FP32Vec8 one(1.0);
  // -x is formed as 0 - x, as no unary minus is used on the vector type here.
  vec_op::FP32Vec8 neg_x = zero - x;
  const vec_op::FP32Vec8 denom = one + neg_x.exp();
  return x / denom;
 }
 // Tanh-approximated GELU:
 //   0.5 * x * (1 + tanh(0.79788456 * (x + 0.044715 * x^3)))
 // where 0.79788456 ~= sqrt(2 / pi).
 FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 one(1.0);
  const vec_op::FP32Vec8 sqrt_2_over_pi(0.79788456f);
  const vec_op::FP32Vec8 cubic_coeff(0.044715f);
  const vec_op::FP32Vec8 half(0.5);
  const vec_op::FP32Vec8 x_cubed = x * x * x;
  vec_op::FP32Vec8 tanh_arg = sqrt_2_over_pi * (x + cubic_coeff * x_cubed);
  const vec_op::FP32Vec8 half_x = half * x;
  return half_x * (one + tanh_arg.tanh());
 }
 // Fast GELU variant:
 //   0.5 * x * (1 + tanh(0.79788456 * x * (1 + 0.044715 * x^2)))
 FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 one(1.0);
  const vec_op::FP32Vec8 sqrt_2_over_pi(0.79788456f);
  const vec_op::FP32Vec8 quad_coeff(0.044715f);
  const vec_op::FP32Vec8 half(0.5);
  const vec_op::FP32Vec8 scaled_x = x * sqrt_2_over_pi;
  const vec_op::FP32Vec8 poly = one + x * quad_coeff * x;
  vec_op::FP32Vec8 tanh_arg = scaled_x * poly;
  const vec_op::FP32Vec8 half_x = half * x;
  return half_x * (one + tanh_arg.tanh());
 }
 // GELU of the form 0.5 * x * (1 + er(x / sqrt(2))).
 // NOTE(review): `er()` is presumably the vectorized error function (erf);
 // confirm against vec_op::FP32Vec8 in cpu_types.hpp.
 FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 one(1.0);
  const vec_op::FP32Vec8 inv_sqrt2(M_SQRT1_2);
  const vec_op::FP32Vec8 half(0.5);
  vec_op::FP32Vec8 er_arg = x * inv_sqrt2;
  const vec_op::FP32Vec8 half_x = x * half;
  return half_x * (one + er_arg.er());
 }
 // Tanh-approximated GELU:
 //   0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
 // sqrt(2 / pi) is built from the <cmath> constants as
 // M_SQRT2 * M_2_SQRTPI * 0.5.
 FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 one(1.0);
  const vec_op::FP32Vec8 sqrt_2_over_pi(M_SQRT2 * M_2_SQRTPI * 0.5);
  const vec_op::FP32Vec8 half(0.5);
  const vec_op::FP32Vec8 cubic_coeff(0.044715);
  const vec_op::FP32Vec8 x_cubed = x * x * x;
  const vec_op::FP32Vec8 tanh_arg = sqrt_2_over_pi * (x + x_cubed * cubic_coeff);
  const vec_op::FP32Vec8 half_x = x * half;
  return half_x * (one + tanh_arg.tanh());
 }
 }; // namespace
 // Gated SiLU: out = silu(input[..., :d]) * input[..., d:], where
 // d = input.size(-1) / 2.
 void silu_and_mul(torch::Tensor &out, torch::Tensor &input) {
  const auto packed_width = input.size(-1);
  const int row_count = input.numel() / packed_width;
  const int half_width = packed_width / 2;
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_impl", [&] {
    CPU_KERNEL_GUARD_IN(silu_and_mul_impl)
    scalar_t *src = input.data_ptr<scalar_t>();
    scalar_t *dst = out.data_ptr<scalar_t>();
    activation_kernel<scalar_t, silu_act, true>(row_count, half_width, src, dst);
    CPU_KERNEL_GUARD_OUT(silu_and_mul_impl)
  });
 }
 // Gated GELU: out = gelu_act(input[..., :d]) * input[..., d:], where
 // d = input.size(-1) / 2.
 void gelu_and_mul(torch::Tensor &out,   // [..., d]
                       torch::Tensor &input) // [..., 2 * d]
 {
  const auto packed_width = input.size(-1);
  const int row_count = input.numel() / packed_width;
  const int half_width = packed_width / 2;
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_and_mul_impl)
    scalar_t *src = input.data_ptr<scalar_t>();
    scalar_t *dst = out.data_ptr<scalar_t>();
    activation_kernel<scalar_t, gelu_act, true>(row_count, half_width, src, dst);
    CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl)
  });
 }
 // Gated tanh-GELU: out = gelu_tanh_act(input[..., :d]) * input[..., d:],
 // where d = input.size(-1) / 2.
 void gelu_tanh_and_mul(torch::Tensor &out,   // [..., d]
                            torch::Tensor &input) // [..., 2 * d]
 {
  const auto packed_width = input.size(-1);
  const int row_count = input.numel() / packed_width;
  const int half_width = packed_width / 2;
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "gelu_tanh_and_mul_impl", [&] {
        CPU_KERNEL_GUARD_IN(gelu_tanh_and_mul_impl)
        scalar_t *src = input.data_ptr<scalar_t>();
        scalar_t *dst = out.data_ptr<scalar_t>();
        activation_kernel<scalar_t, gelu_tanh_act, true>(row_count, half_width,
                                                         src, dst);
        CPU_KERNEL_GUARD_OUT(gelu_tanh_and_mul_impl)
      });
 }
 // Element-wise gelu_new_act: out = gelu_new_act(input), processed as rows of
 // input.size(-1) elements.
 void gelu_new(torch::Tensor &out, torch::Tensor &input) {
  const auto row_width = input.size(-1);
  const int row_count = input.numel() / row_width;
  const int width = row_width;
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_new_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_new_impl)
    scalar_t *src = input.data_ptr<scalar_t>();
    scalar_t *dst = out.data_ptr<scalar_t>();
    activation_kernel<scalar_t, gelu_new_act, false>(row_count, width, src, dst);
    CPU_KERNEL_GUARD_OUT(gelu_new_impl)
  });
 }
 // Element-wise gelu_fast_act: out = gelu_fast_act(input), processed as rows
 // of input.size(-1) elements.
 void gelu_fast(torch::Tensor &out, torch::Tensor &input) {
  const auto row_width = input.size(-1);
  const int row_count = input.numel() / row_width;
  const int width = row_width;
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_fast_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_fast_impl)
    scalar_t *src = input.data_ptr<scalar_t>();
    scalar_t *dst = out.data_ptr<scalar_t>();
    activation_kernel<scalar_t, gelu_fast_act, false>(row_count, width, src, dst);
    CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
  });
 }
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -0,0 +1,746 @@
 #include "cpu_types.hpp"
 namespace {
 // Compile-time table selecting the vec_op register types the attention
 // kernels use for a given element type:
 //   q_load_vec_type / q_vec_type : how the query is loaded / operated on
 //   k_load_vec_type / k_vec_type : how a key chunk is loaded / operated on
 //   qk_acc_vec_type              : accumulator type of the q*k products
 //   v_load_vec_type              : how a value-cache row is loaded
 // The primary template maps everything to void; only the specializations
 // below provide usable types.
 template <typename scalar_t> struct KernelVecType {
  using q_load_vec_type = void;
  using q_vec_type = void;
  using k_load_vec_type = void;
  using k_vec_type = void;
  using qk_acc_vec_type = void;
  using v_load_vec_type = void;
 };
 // float: the query is loaded 4 lanes at a time, everything else is FP32Vec16.
 template <> struct KernelVecType<float> {
  using q_load_vec_type = vec_op::FP32Vec4;
  using q_vec_type = vec_op::FP32Vec16;
  using k_load_vec_type = vec_op::FP32Vec16;
  using k_vec_type = vec_op::FP32Vec16;
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::FP32Vec16;
 };
 // BFloat16: the load/accumulate types shared by both builds are listed once;
 // only the q/k compute types depend on whether __AVX512BF16__ is defined.
 template <> struct KernelVecType<c10::BFloat16> {
  using q_load_vec_type = vec_op::BF16Vec8;
 #ifdef __AVX512BF16__
  // With __AVX512BF16__ the q/k operands stay in BF16Vec32.
  using q_vec_type = vec_op::BF16Vec32;
  using k_load_vec_type = vec_op::BF16Vec32;
  using k_vec_type = vec_op::BF16Vec32;
 #else
  // Otherwise keys are loaded as BF16Vec16 and q/k are operated on as
  // FP32Vec16.
  using q_vec_type = vec_op::FP32Vec16;
  using k_load_vec_type = vec_op::BF16Vec16;
  using k_vec_type = vec_op::FP32Vec16;
 #endif
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::BF16Vec16;
 };
 // In-place softmax over data[0, size); entries in [size, capacity) are set to
 // zero afterwards. Returns {maximum of the inputs, sum of the shifted
 // exponentials}. Note that data[0] is read even when size is 0.
 template <typename T>
 FORCE_INLINE std::pair<T, T> reduceSoftmax(T *data, const int size,
                                           const int capacity) {
  // Pass 1: running maximum, used to shift the exponent argument.
  T peak = data[0];
  for (int pos = 1; pos < size; ++pos) {
    if (!(peak >= data[pos])) {
      peak = data[pos];
    }
  }
  // Pass 2: exponentiate in place and accumulate the denominator.
  T total = 0;
  for (int pos = 0; pos < size; ++pos) {
    const T shifted_exp = std::exp(data[pos] - peak);
    data[pos] = shifted_exp;
    total += shifted_exp;
  }
  // Pass 3: normalize the live entries, then clear the padding. The cursor is
  // shared so the clearing starts where the normalization stopped.
  int pos = 0;
  while (pos < size) {
    data[pos] /= total;
    ++pos;
  }
  while (pos < capacity) {
    data[pos] = 0;
    ++pos;
  }
  return std::pair<T, T>(peak, total);
 }
 // Same as a plain in-place softmax over data[0, size) with zeroed padding up
 // to capacity, except that a linear bias
 //   alibi_slope * (start_index + i - seq_len + 1)
 // is added to element i before the reduction. Returns {maximum of the biased
 // inputs, sum of the shifted exponentials}. data[0] is biased and read even
 // when size is 0.
 template <typename T>
 FORCE_INLINE std::pair<T, T>
 reduceSoftmaxAlibi(T *data, const int size, const int capacity,
                   const float alibi_slope, const int start_index,
                   const int seq_len) {
  // Element 0 is handled up front so it can seed the running maximum.
  data[0] += alibi_slope * (start_index - seq_len + 1);
  T peak = data[0];
  for (int pos = 1; pos < size; ++pos) {
    const T biased =
        data[pos] + alibi_slope * (start_index + pos - seq_len + 1);
    data[pos] = biased;
    if (!(peak >= biased)) {
      peak = biased;
    }
  }
  // Exponentiate in place and accumulate the denominator.
  T total = 0;
  for (int pos = 0; pos < size; ++pos) {
    const T shifted_exp = std::exp(data[pos] - peak);
    data[pos] = shifted_exp;
    total += shifted_exp;
  }
  // Normalize the live entries, then clear the padding with the same cursor.
  int pos = 0;
  while (pos < size) {
    data[pos] /= total;
    ++pos;
  }
  while (pos < capacity) {
    data[pos] = 0;
    ++pos;
  }
  return std::pair<T, T>(peak, total);
 }
 // Converts per-partition softmax statistics into per-partition weights, in
 // place. On entry max_data[i] / sum_data[i] hold the maximum and exponential
 // sum of partition i; on exit sum_data[i] holds
 //   exp(max_i - global_max) * sum_i / (sum_j exp(max_j - global_max) * sum_j
 //                                      + 1e-8).
 // max_data is only read.
 template <typename T>
 FORCE_INLINE void reducePartitonSoftmax(const T *max_data, T *sum_data,
                                        const int size) {
  // Maximum across all partitions.
  T global_max = max_data[0];
  for (int part = 1; part < size; ++part) {
    if (!(global_max >= max_data[part])) {
      global_max = max_data[part];
    }
  }
  // Bring every partition sum onto the common maximum and total them up.
  T total = 0;
  for (int part = 0; part < size; ++part) {
    T factor = std::exp(max_data[part] - global_max);
    total += factor * sum_data[part];
    sum_data[part] *= factor;
  }
  // The small epsilon keeps the division defined when the total is zero.
  const auto denominator = total + 1e-8;
  for (int part = 0; part < size; ++part) {
    sum_data[part] /= denominator;
  }
 }
 // Computes the scaled q*k dot products between one query head vector and the
 // tokens of one key-cache block, writing one float per token into `logits`.
 //
 // Template parameters:
 //   scalar_t   element type of q and of the key cache
 //   HEAD_SIZE  number of elements of the query vector that are consumed
 //   BLOCK_SIZE number of token slots in a key block
 //   x          number of consecutive head elements stored per token in one
 //              slice of the key block (the loops advance q by x and k_block
 //              by x * BLOCK_SIZE per step)
 template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int x>
 struct reduceQKBlockKernel {
  using q_load_vec_type = typename KernelVecType<scalar_t>::q_load_vec_type;
  using q_vec_type = typename KernelVecType<scalar_t>::q_vec_type;
  using k_load_vec_type = typename KernelVecType<scalar_t>::k_load_vec_type;
  using k_vec_type = typename KernelVecType<scalar_t>::k_vec_type;
  using qk_acc_vec_type = typename KernelVecType<scalar_t>::qk_acc_vec_type;
  // Tokens covered by a single key vector load (x elements per token).
  constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x;
  // Number of such loads ("groups") needed to cover 16 tokens.
  constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP;
  // Groups processed per unrolled step on the partial-block path.
  constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4;
  static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4);
  static_assert(k_load_vec_type::get_elem_num() % x == 0);
  // The query is loaded in 16-byte pieces.
  static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);
  // q         start of the query head vector (HEAD_SIZE elements are read)
  // k_block   start of the key block for the matching KV head
  // logits    output; entries [0, group_num * TOKEN_PER_GROUP) are written,
  //           which can extend past token_num up to the end of the last group
  // scale     factor applied to every dot product
  // token_num number of valid tokens in this block
  FORCE_INLINE static void call(const scalar_t *__restrict__ q,
                                const scalar_t *__restrict__ k_block,
                                float *__restrict__ logits, float scale,
                                const int token_num) {
    const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;
    // One accumulator per token group.
    // NOTE(review): nothing here initializes the accumulators, so this relies
    // on qk_acc_vec_type's default constructor producing zeros -- confirm in
    // cpu_types.hpp.
    qk_acc_vec_type group_accums[MAX_GROUP_NUM];
    if (token_num == BLOCK_SIZE) {
      // Full block: every group is processed, fully unrolled.
      for (int q_offset = 0; q_offset < HEAD_SIZE;
           q_offset += x, k_block += x * BLOCK_SIZE) {
        q_load_vec_type q_load_group_vec(q + q_offset);
        q_vec_type q_group_vec(q_load_group_vec);
        vec_op::unroll_loop<int, MAX_GROUP_NUM>(
            [k_block, &q_group_vec, &group_accums](int token_group_idx) {
              k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
                                                             TOKEN_PER_GROUP);
              k_vec_type k_group_vec(k_load_group_vec);
              vec_op::fma(group_accums[token_group_idx], q_group_vec,
                          k_group_vec);
              // Prefetch the same group one slice (x * BLOCK_SIZE elements)
              // ahead, i.e. the data the next q_offset iteration will load.
              vec_op::prefetch(k_block + x * BLOCK_SIZE +
                               token_group_idx * x * TOKEN_PER_GROUP);
            });
      }
    } else {
      // Partial block: only as many groups as needed, UNROLL_GROUP_NUM at a
      // time. group_num is rounded up to a multiple of UNROLL_GROUP_NUM by
      // the stepping, so a few groups past token_num may still be read.
      for (int q_offset = 0; q_offset < HEAD_SIZE;
           q_offset += x, k_block += x * BLOCK_SIZE) {
        q_load_vec_type q_load_group_vec(q + q_offset);
        q_vec_type q_group_vec(q_load_group_vec);
        for (int token_group_start = 0; token_group_start < group_num;
             token_group_start += UNROLL_GROUP_NUM) {
          vec_op::unroll_loop<int, UNROLL_GROUP_NUM>(
              [token_group_start, k_block, &q_group_vec,
               &group_accums](int token_group_idx) {
                // Convert the index within the unrolled step into the
                // absolute group index.
                token_group_idx += token_group_start;
                k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
                                                               TOKEN_PER_GROUP);
                k_vec_type k_group_vec(k_load_group_vec);
                vec_op::fma(group_accums[token_group_idx], q_group_vec,
                            k_group_vec);
                vec_op::prefetch(k_block + x * BLOCK_SIZE +
                                 token_group_idx * x * TOKEN_PER_GROUP);
              });
        }
      }
    }
    // Each accumulator holds TOKEN_PER_GROUP tokens side by side; reduce the
    // lanes belonging to each token and apply the scale.
    for (int token_group_idx = 0; token_group_idx < group_num;
         ++token_group_idx) {
      vec_op::unroll_loop<int, TOKEN_PER_GROUP>(
          [&group_accums, logits, scale, token_group_idx](int token_idx) {
            float dot_v =
                group_accums[token_group_idx]
                    .template reduce_sub_sum<qk_acc_vec_type::get_elem_num() /
                                             TOKEN_PER_GROUP>(token_idx);
            logits[token_group_idx * TOKEN_PER_GROUP + token_idx] =
                dot_v * scale;
          });
    }
  }
 };
 // Accumulates one value-cache block into `acc`:
 //   acc[r] += prob * v_block[r * BLOCK_SIZE .. r * BLOCK_SIZE + BLOCK_SIZE)
 // for r in [0, HEAD_PARTITION_SIZE), as a lane-wise product (no horizontal
 // reduction happens here). `prob` supplies 16 floats and each value row is
 // one v_load_vec_type load, hence the static_assert on BLOCK_SIZE.
 template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE,
          int HEAD_PARTITION_SIZE, typename acc_t>
 FORCE_INLINE void reduceValueBlock(const float *prob, const scalar_t *v_block,
                                   acc_t &&acc) {
  using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
  static_assert(BLOCK_SIZE == v_load_vec_type::get_elem_num());
  // The probabilities are loaded once and reused for every row.
  vec_op::FP32Vec16 probs(prob);
  vec_op::unroll_loop<int, HEAD_PARTITION_SIZE>(
      [&acc, &probs, v_block](int row) {
        const scalar_t *row_ptr = v_block + BLOCK_SIZE * row;
        v_load_vec_type raw_values(row_ptr);
        // Widen to fp32 before multiplying.
        vec_op::FP32Vec16 values(raw_values);
        acc[row] = acc[row] + probs * values;
      });
 }
 }; // namespace
 // Paged attention v1
 namespace {
 // Single-pass paged attention for one decoding step.
 //
 // For every (sequence, query head) pair the kernel
 //   1. computes the scaled q*k logits of all cached tokens, block by block,
 //   2. turns them into probabilities with an in-place softmax (optionally
 //      with an ALiBi bias), and
 //   3. accumulates the probability-weighted values, 16 head elements at a
 //      time, and stores the result into `out`.
 // The logits live in a scratch buffer with one row per OpenMP thread.
 template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
 struct paged_attention_v1_impl {
  static void
  call(scalar_t *__restrict__ out,           // [num_seqs, num_heads, head_size]
       const scalar_t *__restrict__ q,       // [num_seqs, num_heads, head_size]
       const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads,
                                             // head_size/x, block_size, x]
       const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads,
                                             // head_size, block_size]
       const int num_kv_heads, const float scale,
       const int
           *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
       const int *__restrict__ seq_lens, // [num_seqs]
       const int max_num_blocks_per_seq,
       const float *__restrict__ alibi_slopes, // [num_heads]
       const int q_stride, const int kv_block_stride, const int kv_head_stride,
       const int num_seqs, const int num_heads) {
    // Number of scalar_t elements in 16 bytes; innermost key-cache dimension.
    constexpr int x = 16 / sizeof(scalar_t);
    const int num_queries_per_kv = num_heads / num_kv_heads;
    static_assert(BLOCK_SIZE == 16);
    // Longest sequence the block table can describe, rounded up to a multiple
    // of 16 floats so every scratch row starts on a 64-byte boundary.
    int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
    int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
    TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
    const int parallel_work_item_num = omp_get_max_threads();
    // Widen before multiplying so the byte count cannot wrap in 32-bit int.
    const size_t logits_bytes = static_cast<size_t>(parallel_work_item_num) *
                                max_seq_len_padded * sizeof(float);
    float *logits = (float *)std::aligned_alloc(
        64, logits_bytes); // Cacheline alignment for each context token.
                           // [parallel_work_item_num, max_seq_len_padded]
    // aligned_alloc reports failure with a null pointer; fail loudly instead
    // of dereferencing it inside the parallel region. A zero-byte request is
    // allowed to return null, so it is exempt from the check.
    TORCH_CHECK(logits_bytes == 0 || logits != nullptr, "Failed to allocate ",
                logits_bytes, " bytes for the attention logits buffer.");
 #pragma omp parallel for collapse(2) schedule(dynamic, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        int seq_len = seq_lens[seq_idx];
        const int *seq_block_table =
            block_tables + max_num_blocks_per_seq * seq_idx;
        const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
        // Consecutive groups of num_queries_per_kv query heads share a KV
        // head.
        const int64_t kv_head_idx = head_idx / num_queries_per_kv;
        const scalar_t *__restrict__ q_vec_ptr =
            q + seq_idx * q_stride + head_idx * HEAD_SIZE;
        const int last_block_token_num =
            seq_len - (block_num - 1) * BLOCK_SIZE;
        // Scratch row owned by the executing thread (64-bit offset to match
        // the widened allocation size).
        float *__restrict__ thread_block_logits =
            logits +
            static_cast<int64_t>(omp_get_thread_num()) * max_seq_len_padded;
        // Compute logits
        for (int block_idx = 0; block_idx < block_num; ++block_idx) {
          const int64_t physical_block_idx = seq_block_table[block_idx];
          const scalar_t *__restrict__ k_block_cache_ptr =
              k_cache + physical_block_idx * kv_block_stride +
              kv_head_idx * kv_head_stride;
          float *__restrict__ head_block_logits =
              thread_block_logits + block_idx * BLOCK_SIZE;
          // Only the final block can be partially filled.
          reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
              q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
              block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
        }
        // Compute softmax
        // Entries between seq_len and block_num * BLOCK_SIZE are zeroed so
        // the padded tail of the last block contributes nothing below.
        if (alibi_slopes) {
          reduceSoftmaxAlibi(thread_block_logits, seq_len,
                             block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
                             seq_len);
        } else {
          reduceSoftmax(thread_block_logits, seq_len,
                        block_num * BLOCK_SIZE);
        }
        // Compute value
        // The head is processed in slices of 16 elements; each slice keeps 16
        // vector accumulators (one per head element) across all blocks.
        constexpr int head_elem_num_per_partition = 16;
        constexpr int head_partition_num =
            HEAD_SIZE / head_elem_num_per_partition;
        for (int head_part_idx = 0; head_part_idx < head_partition_num;
             ++head_part_idx) {
          // NOTE(review): relies on FP32Vec16's default constructor producing
          // zeros -- confirm in cpu_types.hpp.
          vec_op::FP32Vec16 accums[head_elem_num_per_partition];
          scalar_t *__restrict__ out_ptr =
              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
              head_part_idx * head_elem_num_per_partition;
          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
            const int64_t physical_block_idx = seq_block_table[block_idx];
            const float *__restrict__ prob_vec_ptr =
                thread_block_logits + block_idx * BLOCK_SIZE;
            const scalar_t *__restrict__ v_block_cache_ptr =
                v_cache + physical_block_idx * kv_block_stride +
                kv_head_idx * kv_head_stride +
                BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
            reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
                             head_elem_num_per_partition>(
                prob_vec_ptr, v_block_cache_ptr, accums);
            // Prefetch the same head slice of the next block; only every
            // second row is requested.
            if (block_idx != block_num - 1) {
              const int64_t next_physical_block_idx =
                  seq_block_table[block_idx + 1];
              const scalar_t *__restrict__ next_v_block_cache_ptr =
                  v_cache + next_physical_block_idx * kv_block_stride +
                  kv_head_idx * kv_head_stride +
                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
              vec_op::unroll_loop<int, head_elem_num_per_partition>(
                  [&](int head_elem_idx) {
                    if (head_elem_idx % 2 == 0) {
                      vec_op::prefetch(next_v_block_cache_ptr +
                                       BLOCK_SIZE * head_elem_idx);
                    }
                  });
            }
          }
          // Collapse each accumulator across its lanes (i.e. across the
          // tokens of a block) into the final output element.
          vec_op::unroll_loop<int, head_elem_num_per_partition>(
              [&](int head_elem_idx) {
                float value = accums[head_elem_idx].reduce_sum();
                vec_op::storeFP32(value, out_ptr + head_elem_idx);
              });
        }
      }
    }
    std::free(logits);
  }
 };
 // Expands to a call of paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call.
 // Apart from the three macro parameters, every argument is a plain
 // identifier, so the expansion site must have variables with exactly these
 // names in scope. The expansion already ends with a ';'.
 #define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                   \
  paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call(                     \
      out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
      block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,              \
      alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs,   \
      num_heads);
 // Unpacks the tensors into sizes, strides and raw pointers and forwards them
 // to the paged_attention_v1_impl instantiation matching the runtime head
 // size. Unsupported head sizes raise through TORCH_CHECK. `max_seq_len` is
 // accepted but not used here.
 //
 // The local names below are consumed verbatim by LAUNCH_V1_ATTENTION_KERNEL
 // and therefore must not be renamed.
 template <typename T, int BLOCK_SIZE>
 void paged_attention_v1_impl_launcher(
    torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache,
    torch::Tensor &value_cache, int num_kv_heads, float scale,
    torch::Tensor &block_tables, torch::Tensor &seq_lens,
    int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
  // Problem sizes, read from the query and the block table.
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);

  // Element strides used to address the query and the caches.
  int q_stride = query.stride(0);
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

  // NOTE: alibi_slopes is optional; a null pointer selects the plain softmax.
  const float *alibi_slopes_ptr = nullptr;
  if (alibi_slopes.has_value()) {
    alibi_slopes_ptr = static_cast<const float *>(alibi_slopes->data_ptr());
  }

  // Untyped tensor storage viewed as the dispatched element type.
  T *out_ptr = static_cast<T *>(out.data_ptr());
  T *query_ptr = static_cast<T *>(query.data_ptr());
  T *key_cache_ptr = static_cast<T *>(key_cache.data_ptr());
  T *value_cache_ptr = static_cast<T *>(value_cache.data_ptr());
  int *block_tables_ptr = block_tables.data_ptr<int>();
  int *seq_lens_ptr = seq_lens.data_ptr<int>();

  // HEAD_SIZE is a template parameter of the kernel, so each supported value
  // needs its own instantiation.
  switch (head_size) {
  case 64: LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break;
  case 80: LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); break;
  case 96: LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); break;
  case 112: LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); break;
  case 128: LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); break;
  case 256: LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); break;
  default:
    TORCH_CHECK(false, "Unsupported head size: ", head_size);
    break;
  }
 }
 // Expands to a call of paged_attention_v1_impl_launcher<T, BLOCK_SIZE>,
 // forwarding identifiers that must exist under these exact names at the
 // expansion site. The expansion already ends with a ';'.
 #define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE)                                 \
  paged_attention_v1_impl_launcher<T, BLOCK_SIZE>(                             \
      out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,   \
      seq_lens, max_seq_len, alibi_slopes);
 // Turns the runtime `block_size` (read from the expansion site) into the
 // BLOCK_SIZE template argument. Only 16 is handled; any other value raises
 // through TORCH_CHECK.
 #define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T)                                  \
  switch (block_size) {                                                        \
  case 16:                                                                     \
    CALL_V1_KERNEL_LAUNCHER(T, 16);                                            \
    break;                                                                     \
  default:                                                                     \
    TORCH_CHECK(false, "Unsupported block size: ", block_size);                \
    break;                                                                     \
  }
 } // namespace
 // Entry point of the CPU paged attention v1 kernel.
 //
 // Rejects any kv_scale other than 1.0f, then dispatches on the dtype of
 // `query` and on `block_size` to the templated launcher. The parameter names
 // are consumed verbatim by CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE, so they must
 // not be renamed. `kv_cache_dtype` is accepted but not read in this function.
 void paged_attention_v1(torch::Tensor &out, torch::Tensor &query,
                         torch::Tensor &key_cache, torch::Tensor &value_cache,
                         int num_kv_heads, float scale,
                         torch::Tensor &block_tables,
                         torch::Tensor &seq_lens, int block_size,
                         int max_seq_len,
                         const c10::optional<torch::Tensor> &alibi_slopes,
                         const std::string &kv_cache_dtype, float kv_scale) {
  // Only the identity scale is accepted by this implementation.
  TORCH_CHECK(kv_scale == 1.0f);
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
                               [&] {
                                 CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
                                 CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
                                 CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl)
                               });
 }
 // Paged attention v2
 namespace {
 // Partitioned paged attention for one decoding step.
 //
 // Each sequence is cut into partitions of PARTITION_SIZE tokens and the work
 // is done in three OpenMP phases:
 //   1. per (sequence, partition, head): logits, softmax local to the
 //      partition and probability-weighted value sum. With a single partition
 //      the result goes straight to `out`; otherwise it goes to `tmp_out` and
 //      the partition's max logit / exponential sum are recorded.
 //   2. per (sequence, head): the recorded statistics are converted in place
 //      into per-partition rescale factors (stored in `exp_sums`).
 //   3. per (sequence, head, 16-element head group): the partial outputs in
 //      `tmp_out` are combined with those factors and written to `out`.
 // Sequences that fit in one partition are skipped by phases 2 and 3.
 template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
 struct paged_attention_v2_impl {
  static void call(
      scalar_t *__restrict__ out,   // [num_seqs, num_heads, head_size]
      float *__restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
      float
          *__restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
      scalar_t *__restrict__ tmp_out,       // [num_seqs, num_heads,
                                            // max_num_partitions, head_size]
      const scalar_t *__restrict__ q,       // [num_seqs, num_heads, head_size]
      const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads,
                                            // head_size/x, block_size, x]
      const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads,
                                            // head_size, block_size]
      const int num_kv_heads, const float scale,
      const int
          *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
      const int *__restrict__ seq_lens, // [num_seqs]
      const int max_num_blocks_per_seq,
      const float *__restrict__ alibi_slopes, // [num_heads]
      const int q_stride, const int kv_block_stride, const int kv_head_stride,
      const int num_seqs, const int num_heads, const int max_num_partitions) {
    // Number of scalar_t elements in 16 bytes; innermost key-cache dimension.
    constexpr int x = 16 / sizeof(scalar_t);
    const int num_queries_per_kv = num_heads / num_kv_heads;
    static_assert(BLOCK_SIZE == 16);
    static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0);
    static_assert(PARTITION_SIZE % BLOCK_SIZE == 0);
    // Phase 1: attention restricted to each partition.
 #pragma omp parallel for collapse(3) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int partition_idx = 0; partition_idx < max_num_partitions;
           ++partition_idx) {
        for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
          const int seq_len = seq_lens[seq_idx];
          const int start_token_idx = partition_idx * PARTITION_SIZE;
          // Partitions that start past the end of the sequence have no work.
          if (start_token_idx >= seq_len)
            continue;
          const int partition_num =
              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
          // A single partition needs no cross-partition reduction.
          const bool no_reduce = (partition_num == 1);
          // Tokens of this partition; only the last one can be short.
          const int token_num =
              (std::min(seq_len, start_token_idx + PARTITION_SIZE) -
               start_token_idx);
          const int block_num =
              (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
          const int last_block_token_num =
              token_num - (block_num - 1) * BLOCK_SIZE;
          // Block-table row of this sequence, advanced to the first block of
          // the partition.
          const int *seq_block_table = block_tables +
                                       max_num_blocks_per_seq * seq_idx +
                                       start_token_idx / BLOCK_SIZE;
          // Consecutive groups of num_queries_per_kv query heads share a KV
          // head.
          const int64_t kv_head_idx = head_idx / num_queries_per_kv;
          const scalar_t *__restrict__ q_vec_ptr =
              q + seq_idx * q_stride + head_idx * HEAD_SIZE;
          // Per-iteration, 64-byte aligned scratch for the partition logits.
          float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};
          // Compute logits
          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
            const int64_t physical_block_idx = seq_block_table[block_idx];
            const scalar_t *__restrict__ k_block_cache_ptr =
                k_cache + physical_block_idx * kv_block_stride +
                kv_head_idx * kv_head_stride;
            float *__restrict__ head_block_logits =
                logits + block_idx * BLOCK_SIZE;
            reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
                q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
                block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
          }
          // Softmax local to the partition. The ALiBi variant receives the
          // partition's first token index so the bias uses absolute
          // positions.
          std::pair<float, float> max_and_sum;
          if (alibi_slopes) {
            max_and_sum = reduceSoftmaxAlibi(
                logits, token_num, block_num * BLOCK_SIZE,
                alibi_slopes[head_idx], start_token_idx, seq_len);
          } else {
            max_and_sum = reduceSoftmax(logits, token_num,
                                        block_num * BLOCK_SIZE);
          }
          auto &&[max_logit, exp_sum] = max_and_sum;
          // Choose the destination: the partition slot in tmp_out (recording
          // the softmax statistics for phase 2), or out directly.
          scalar_t *__restrict__ output_buffer = nullptr;
          if (!no_reduce) {
            auto idx = seq_idx * num_heads * max_num_partitions +
                       head_idx * max_num_partitions + partition_idx;
            max_logits[idx] = max_logit;
            exp_sums[idx] = exp_sum;
            output_buffer =
                tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
                head_idx * max_num_partitions * HEAD_SIZE +
                partition_idx * HEAD_SIZE;
          } else {
            output_buffer =
                out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
          }
          // Compute value
          // The head is processed in slices of 16 elements; each slice keeps
          // 16 vector accumulators across all blocks of the partition.
          constexpr int head_elem_num_per_partition = 16;
          constexpr int head_partition_num =
              HEAD_SIZE / head_elem_num_per_partition;
          for (int head_part_idx = 0; head_part_idx < head_partition_num;
               ++head_part_idx) {
            // NOTE(review): relies on FP32Vec16's default constructor
            // producing zeros -- confirm in cpu_types.hpp.
            vec_op::FP32Vec16 accums[head_elem_num_per_partition];
            scalar_t *__restrict__ out_ptr =
                output_buffer + head_part_idx * head_elem_num_per_partition;
            for (int block_idx = 0; block_idx < block_num; ++block_idx) {
              const int64_t physical_block_idx = seq_block_table[block_idx];
              const float *__restrict__ prob_vec_ptr =
                  logits + block_idx * BLOCK_SIZE;
              const scalar_t *__restrict__ v_block_cache_ptr =
                  v_cache + physical_block_idx * kv_block_stride +
                  kv_head_idx * kv_head_stride +
                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
              reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
                               head_elem_num_per_partition>(
                  prob_vec_ptr, v_block_cache_ptr, accums);
              // Prefetch the same head slice of the next block; only every
              // second row is requested.
              if (block_idx != block_num - 1) {
                const int64_t next_physical_block_idx =
                    seq_block_table[block_idx + 1];
                const scalar_t *__restrict__ next_v_block_cache_ptr =
                    v_cache + next_physical_block_idx * kv_block_stride +
                    kv_head_idx * kv_head_stride +
                    BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
                vec_op::unroll_loop<int, head_elem_num_per_partition>(
                    [&](int head_elem_idx) {
                      if (head_elem_idx % 2 == 0) {
                        vec_op::prefetch(next_v_block_cache_ptr +
                                         BLOCK_SIZE * head_elem_idx);
                      }
                    });
              }
            }
            // Collapse each accumulator across its lanes into one output
            // element.
            vec_op::unroll_loop<int, head_elem_num_per_partition>(
                [&](int head_elem_idx) {
                  float value = accums[head_elem_idx].reduce_sum();
                  vec_op::storeFP32(value, out_ptr + head_elem_idx);
                });
          }
        }
      }
    }
    // Rescale partition softmax and store the factors to exp_sums
 #pragma omp parallel for collapse(2) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        const int seq_len = seq_lens[seq_idx];
        const int partition_num =
            (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
        // Single-partition results were already written to out.
        if (partition_num == 1)
          continue;
        reducePartitonSoftmax(
            max_logits + seq_idx * num_heads * max_num_partitions +
                head_idx * max_num_partitions,
            exp_sums + seq_idx * num_heads * max_num_partitions +
                head_idx * max_num_partitions,
            partition_num);
      }
    }
    // Reduce values
    using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
    static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
    constexpr int head_elem_num_per_group =
        16; // Note: didn't align with the cacheline size, due to some HEAD_SIZE
            // didn't align with 64 bytes
    static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
    constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
    // After the phase above, exp_sums holds the per-partition weights.
    const float *__restrict__ rescale_factors = exp_sums;
 #pragma omp parallel for collapse(3) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
          const int seq_len = seq_lens[seq_idx];
          const int partition_num =
              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
          // Single-partition results were already written to out.
          if (partition_num == 1)
            continue;
          const float *__restrict__ seq_head_rescale_factors =
              rescale_factors + seq_idx * num_heads * max_num_partitions +
              head_idx * max_num_partitions;
          const scalar_t *__restrict__ seq_head_tmp_out =
              tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
              head_idx * max_num_partitions * HEAD_SIZE +
              group_idx * head_elem_num_per_group;
          scalar_t *__restrict__ seq_head_output =
              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
              group_idx * head_elem_num_per_group;
          // Weighted sum over partitions for this 16-element head group.
          // NOTE(review): relies on FP32Vec16's default constructor producing
          // zeros -- confirm in cpu_types.hpp.
          vec_op::FP32Vec16 acc;
          for (int i = 0; i < partition_num; ++i) {
            vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]);
            v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE);
            vec_op::FP32Vec16 fp32_value(value);
            acc = acc + fp32_value * rescale_factor;
          }
          // Convert back to the storage type and write the final result.
          v_load_vec_type cast_acc(acc);
          cast_acc.save(seq_head_output);
        }
      }
    }
  }
 };
 // Expands to a call of
 // paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call.
 // PARTITION_SIZE and every call argument are plain identifiers taken from the
 // expansion site, which must have them in scope under exactly these names.
 // The expansion already ends with a ';'.
 #define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                   \
  paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call(     \
      out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr,           \
      key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr,   \
      seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,    \
      kv_block_stride, kv_head_stride, num_seqs, num_heads,                    \
      max_num_partitions);
 // Unpacks the tensors into sizes, strides and raw pointers and forwards them
 // to the paged_attention_v2_impl instantiation matching the runtime head
 // size. Unsupported head sizes raise through TORCH_CHECK. `block_size` and
 // `max_seq_len` are accepted but not used here.
 //
 // The local names below (and PARTITION_SIZE) are consumed verbatim by
 // LAUNCH_V2_ATTENTION_KERNEL and therefore must not be renamed.
 template <typename T, int BLOCK_SIZE, int PARTITION_SIZE = 512>
 void paged_attention_v2_impl_launcher(
    torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits,
    torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache,
    torch::Tensor &value_cache, int num_kv_heads, float scale,
    torch::Tensor &block_tables, torch::Tensor &seq_lens, int block_size,
    int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
  // Problem sizes, read from the query and the block table.
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);

  // Element strides used to address the query and the caches.
  int q_stride = query.stride(0);
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

  // The partition capacity is taken from the last dimension of exp_sums.
  int max_num_partitions = exp_sums.size(-1);

  // NOTE: alibi_slopes is optional; a null pointer selects the plain softmax.
  const float *alibi_slopes_ptr = nullptr;
  if (alibi_slopes.has_value()) {
    alibi_slopes_ptr = static_cast<const float *>(alibi_slopes->data_ptr());
  }

  // Untyped tensor storage viewed as the dispatched element type (float for
  // the softmax statistics).
  T *out_ptr = static_cast<T *>(out.data_ptr());
  float *exp_sums_ptr = static_cast<float *>(exp_sums.data_ptr());
  float *max_logits_ptr = static_cast<float *>(max_logits.data_ptr());
  T *tmp_out_ptr = static_cast<T *>(tmp_out.data_ptr());
  T *query_ptr = static_cast<T *>(query.data_ptr());
  T *key_cache_ptr = static_cast<T *>(key_cache.data_ptr());
  T *value_cache_ptr = static_cast<T *>(value_cache.data_ptr());
  int *block_tables_ptr = block_tables.data_ptr<int>();
  int *seq_lens_ptr = seq_lens.data_ptr<int>();

  // HEAD_SIZE is a template parameter of the kernel, so each supported value
  // needs its own instantiation.
  switch (head_size) {
  case 64: LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break;
  case 80: LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); break;
  case 96: LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); break;
  case 112: LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); break;
  case 128: LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); break;
  case 256: LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); break;
  default:
    TORCH_CHECK(false, "Unsupported head size: ", head_size);
    break;
  }
 }
 // Expands to a call of paged_attention_v2_impl_launcher<T, BLOCK_SIZE> (the
 // PARTITION_SIZE template argument is left at its default), forwarding
 // identifiers that must exist under these exact names at the expansion site.
 // The expansion already ends with a ';'.
 #define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE)                                 \
  paged_attention_v2_impl_launcher<T, BLOCK_SIZE>(                             \
      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,       \
      num_kv_heads, scale, block_tables, seq_lens, block_size,             \
      max_seq_len, alibi_slopes);
 // Turns the runtime `block_size` (read from the expansion site) into the
 // BLOCK_SIZE template argument. Only 16 is handled; any other value raises
 // through TORCH_CHECK.
 #define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T)                                  \
  switch (block_size) {                                                        \
  case 16:                                                                     \
    CALL_V2_KERNEL_LAUNCHER(T, 16);                                            \
    break;                                                                     \
  default:                                                                     \
    TORCH_CHECK(false, "Unsupported block size: ", block_size);                \
    break;                                                                     \
  }
 } // namespace
 // Entry point of the CPU paged attention v2 (partitioned) kernel.
 //
 // Rejects any kv_scale other than 1.0f, then dispatches on the dtype of
 // `query` and on `block_size` to the templated launcher. The parameter names
 // are consumed verbatim by CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE, so they must
 // not be renamed. `kv_cache_dtype` is accepted but not read in this function.
 void paged_attention_v2(torch::Tensor &out, torch::Tensor &exp_sums,
                         torch::Tensor &max_logits, torch::Tensor &tmp_out,
                         torch::Tensor &query, torch::Tensor &key_cache,
                         torch::Tensor &value_cache, int num_kv_heads,
                         float scale, torch::Tensor &block_tables,
                         torch::Tensor &seq_lens, int block_size,
                         int max_seq_len,
                         const c10::optional<torch::Tensor> &alibi_slopes,
                         const std::string &kv_cache_dtype, float kv_scale) {
  // Only the identity scale is accepted by this implementation.
  TORCH_CHECK(kv_scale == 1.0f);
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
                               [&] {
                                 CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
                                 CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
                                 CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
                               });
 }
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -0,0 +1,141 @@
 #include <map>
 #include <vector>
 #include "cpu_types.hpp"
 namespace {
 // Copies whole cache blocks inside every layer's key and value cache.
 //
 // For each (source, target) pair in `mapping_pairs`, the block at index
 // `source` is copied over the block at index `target`, in both the key cache
 // and the value cache of every layer.
 //
 //   key_caches / value_caches  per-layer cache tensors holding scalar_t data
 //   mapping_pairs              (source block, target block) index pairs;
 //                              taken by const reference so the vector is not
 //                              copied on every call
 //   element_num_per_block      number of scalar_t elements in one block
 //   layer_num                  number of layers to process
 template <typename scalar_t>
 void copy_blocks_cpu_impl(
    std::vector<torch::Tensor> &key_caches,
    std::vector<torch::Tensor> &value_caches,
    const std::vector<std::pair<int64_t, int64_t>> &mapping_pairs,
    const int element_num_per_block, const int layer_num) {
  const size_t pair_num = mapping_pairs.size();
  const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
  // The two loops are kept perfectly nested so that collapse(2) can distribute
  // (layer, pair) combinations across threads.
 #pragma omp parallel for collapse(2)
  for (int layer = 0; layer < layer_num; ++layer) {
    for (size_t pair = 0; pair < pair_num; ++pair) {
      // int * int64_t promotes to int64_t, so the offsets are 64-bit.
      int64_t source_offset = element_num_per_block * mapping_pairs[pair].first;
      int64_t target_offset =
          element_num_per_block * mapping_pairs[pair].second;
      scalar_t *key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
      scalar_t *source_ptr = key_cache_ptr + source_offset;
      scalar_t *target_ptr = key_cache_ptr + target_offset;
      std::memcpy(target_ptr, source_ptr, block_bytes);
      scalar_t *value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
      source_ptr = value_cache_ptr + source_offset;
      target_ptr = value_cache_ptr + target_offset;
      std::memcpy(target_ptr, source_ptr, block_bytes);
    }
  }
 }
 // Scatters per-token key/value heads into the paged key and value caches.
 //
 // For every token whose slot index is non-negative, each head is written to
 // block `slot / block_size` at in-block position `slot % block_size`.
 // Keys are stored in groups of `x` consecutive head elements: element
 // `k + i` (k a multiple of x, 0 <= i < x) lands at offset
 // `k * block_size + position * x + i` inside the head's region. Values are
 // stored with element `v` at offset `v * block_size + position`.
 // Tokens with a negative slot index are skipped.
 //
 //   key / value               source tensors, row stride given by
 //                             key_stride / value_stride (in elements)
 //   key_cache / value_cache   destination caches
 //   slot_mapping              one slot index per token
 //   num_tokens, num_heads, head_size, block_size, x   layout parameters
 //
 // All offsets are computed in 64-bit arithmetic so that large token counts,
 // strides or cache sizes cannot overflow a 32-bit int.
 template <typename scalar_t>
 void reshape_and_cache_cpu_impl(
    const scalar_t *__restrict__ key, const scalar_t *__restrict__ value,
    scalar_t *__restrict__ key_cache, scalar_t *__restrict__ value_cache,
    const int64_t *__restrict__ slot_mapping, const int num_tokens,
    const int key_stride, const int value_stride, const int num_heads,
    const int head_size, const int block_size, const int x) {
  const int64_t block_elem_num =
      static_cast<int64_t>(num_heads) * head_size * block_size;
  // The slot lookup stays inside the inner loop so the two loops remain
  // perfectly nested, as collapse(2) requires.
 #pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
      const int64_t slot_idx = slot_mapping[token_idx];
      if (slot_idx >= 0) {
        const int64_t src_head_offset =
            static_cast<int64_t>(head_idx) * head_size;
        const int64_t src_key_head_idx =
            static_cast<int64_t>(token_idx) * key_stride + src_head_offset;
        const int64_t src_value_head_idx =
            static_cast<int64_t>(token_idx) * value_stride + src_head_offset;
        const scalar_t *src_key_head_ptr = key + src_key_head_idx;
        const scalar_t *src_value_head_ptr = value + src_value_head_idx;
        const int64_t block_index = slot_idx / block_size;
        const int64_t block_offset = slot_idx % block_size;
        // Key and value caches share the same per-head base offset.
        const int64_t cache_head_base =
            block_elem_num * block_index +
            static_cast<int64_t>(head_idx) * block_size * head_size;
        scalar_t *target_key_head_ptr = key_cache + cache_head_base;
        scalar_t *target_value_head_ptr = value_cache + cache_head_base;
        for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) {
          const int64_t target_offset =
              static_cast<int64_t>(src_key_idx) * block_size +
              block_offset * x;
          for (int i = 0; i < x; ++i) {
            target_key_head_ptr[target_offset + i] =
                src_key_head_ptr[src_key_idx + i];
          }
        }
        for (int src_value_idx = 0; src_value_idx < head_size;
             ++src_value_idx) {
          const int64_t target_offset =
              static_cast<int64_t>(src_value_idx) * block_size + block_offset;
          target_value_head_ptr[target_offset] =
              src_value_head_ptr[src_value_idx];
        }
      }
    }
  }
 }
 }; // namespace
 // Copies cache blocks within every layer according to `block_mapping`.
 //
 // `block_mapping` maps a source block index to the list of destination block
 // indices that should receive a copy of it. The mapping is flattened into
 // (source, destination) pairs and handed to the typed implementation selected
 // by the element type of the first key cache. Does nothing when there are no
 // layers; fails if the key and value cache lists differ in length.
 void copy_blocks(std::vector<torch::Tensor> &key_caches,
                 std::vector<torch::Tensor> &value_caches,
                 const std::map<int64_t, std::vector<int64_t>> &block_mapping) {
  int num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
  if (num_layers == 0)
    return;

  // Flatten {src -> [dst...]} into a list of (src, dst) pairs.
  std::vector<std::pair<int64_t, int64_t>> src_dst_pairs;
  src_dst_pairs.reserve(block_mapping.size());
  for (const auto &[src_block, dst_blocks] : block_mapping) {
    for (const int64_t &dst_block : dst_blocks) {
      src_dst_pairs.emplace_back(src_block, dst_block);
    }
  }

  // Size of one block, taken from the first block of the first key cache.
  const torch::Tensor &first_key_cache = key_caches[0];
  const int block_numel = first_key_cache[0].numel();

  VLLM_DISPATCH_FLOATING_TYPES(
      first_key_cache.scalar_type(), "copy_blocks_cpu_impl", [&] {
        CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
        copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, src_dst_pairs,
                                       block_numel, num_layers);
        CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
      });
 }
 // Writes the given key/value tensors into the paged caches at the slots named
 // by `slot_mapping`.
 //
 // Layout parameters are read from the tensors themselves: dimensions 0, 1, 2
 // of `key` give the token count, head count and head size; dimensions 3 and 4
 // of `key_cache` give the block size and the key packing factor `x`. The row
 // strides of `key` and `value` are passed through so the sources need not be
 // densely packed along dimension 0.
 // Only `kv_scale == 1.0` is accepted. `kv_cache_dtype` is not referenced in
 // this function.
 // NOTE(review): sizes and strides are narrowed to `int` here - confirm they
 // always fit for the tensors callers pass.
 void reshape_and_cache(torch::Tensor &key, torch::Tensor &value,
                       torch::Tensor &key_cache, torch::Tensor &value_cache,
                       torch::Tensor &slot_mapping,
                       const std::string &kv_cache_dtype, float kv_scale) {
  TORCH_CHECK(kv_scale == 1.0f);
  int num_tokens = key.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
  int block_size = key_cache.size(3);
  int x = key_cache.size(4);
  int key_stride = key.stride(0);
  int value_stride = value.stride(0);
  // Dispatch on the element type of `key`; the caches and `value` are read
  // with the same scalar type.
  VLLM_DISPATCH_FLOATING_TYPES(
      key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
        CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
        reshape_and_cache_cpu_impl<scalar_t>(
            key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
            key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
            slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
            value_stride, num_heads, head_size, block_size, x);
        CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
      });
 }
 // Block swapping is not implemented for the CPU backend: every call fails
 // through TORCH_CHECK with an explanatory message. The parameters are unused.
 void swap_blocks(torch::Tensor &src, torch::Tensor &dst,
                 const std::map<int64_t, int64_t> &block_mapping) {
  // Terminated with a semicolon like every other TORCH_CHECK in this file,
  // rather than relying on how the macro happens to expand.
  TORCH_CHECK(false, "swap_blocks is unsupported on CPU.");
 }
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -0,0 +1,352 @@
 #ifndef CPU_TYPES_HPP
 #define CPU_TYPES_HPP
 #include <immintrin.h>
 #include <torch/extension.h>
 namespace vec_op {
 // FIXME: FP16 is not fully supported in Torch-CPU
 // Dispatch cases for the element types handled by the CPU kernels: float and
 // bfloat16.
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
 // Runs the trailing callable with `scalar_t` bound to the type matching TYPE;
 // NAME is the label passed to AT_DISPATCH_SWITCH.
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 // Kernel entry/exit tracing. The guards expand to nothing unless CPU_OP_GUARD
 // is defined, in which case they print the kernel name to std::cout. Both
 // expansions already end in a semicolon, so call sites omit one.
 #ifndef CPU_OP_GUARD
 #define CPU_KERNEL_GUARD_IN(NAME)
 #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
 #define CPU_KERNEL_GUARD_IN(NAME)                                              \
  std::cout << #NAME << " invoked." << std::endl;
 #define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
 #endif
 // Requests inlining regardless of the optimizer's own heuristics
 // (GCC/Clang attribute syntax).
 #define FORCE_INLINE __attribute__((always_inline)) inline
 namespace {
 // Invokes `f` once per index in the integer sequence, passing each index as a
 // std::integral_constant so it is available as a compile-time value. The calls
 // are expanded with a comma fold expression, i.e. in index order.
 template <typename T, T... indexes, typename F>
 constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
  (f(std::integral_constant<T, indexes>{}), ...);
 }
 }; // namespace
 // Calls `f(0)`, `f(1)`, ..., `f(count - 1)` with the loop expanded at compile
 // time. Only participates in overload resolution when `F` is invocable with a
 // `T` argument.
 template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
 constexpr void unroll_loop(F &&f) {
  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
 }
 // CRTP base for the SIMD wrapper types: exposes the derived type's
 // `VEC_ELEM_NUM` constant through a uniform static accessor.
 template <typename T>
 struct Vec {
  // Number of elements held by one vector of type T.
  static constexpr int get_elem_num() {
    return T::VEC_ELEM_NUM;
  }
 };
 struct FP32Vec8;
 struct FP32Vec16;
 #ifdef __AVX512FP16__
 // Eight half-precision floats in a 128-bit register. Only compiled when the
 // target supports AVX512-FP16.
 struct FP16Vec8 : public Vec<FP16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  __m128h reg;
  // Broadcasts `v` to all eight lanes.
  explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
  // Loads eight values from `ptr` (unaligned load).
  explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
  explicit FP16Vec8(__m128h data) : reg(data) {}
  // Lane-wise arithmetic.
  FP16Vec8 operator*(const FP16Vec8 &b) const {
    return FP16Vec8(_mm_mul_ph(reg, b.reg));
  }
  FP16Vec8 operator+(const FP16Vec8 &b) const {
    return FP16Vec8(_mm_add_ph(reg, b.reg));
  }
  FP16Vec8 operator-(const FP16Vec8 &b) const {
    return FP16Vec8(_mm_sub_ph(reg, b.reg));
  }
  FP16Vec8 operator/(const FP16Vec8 &b) const {
    return FP16Vec8(_mm_div_ph(reg, b.reg));
  }
  // Stores the eight values to `ptr` (unaligned store).
  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
 };
 #endif
 // Eight bfloat16 values held as raw 16-bit lanes in a 128-bit integer
 // register.
 struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  __m128i reg;
  // Loads eight values from `ptr`; no alignment is required.
  explicit BF16Vec8(const void *ptr)
      : reg(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}
  // Narrows eight floats to bfloat16 (defined after FP32Vec8 is complete).
  explicit BF16Vec8(const FP32Vec8 &);
  // Stores the eight values to `ptr`. Uses an unaligned store so that, like
  // the load above, it does not require `ptr` to be 16-byte aligned.
  void save(void *ptr) const {
    _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), reg);
  }
 };
 // Sixteen bfloat16 values held as raw 16-bit lanes in a 256-bit integer
 // register.
 struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  __m256i reg;
  // Loads sixteen values from `ptr`; no alignment is required.
  explicit BF16Vec16(const void *ptr)
      : reg(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}
  // Narrows sixteen floats to bfloat16 (defined after FP32Vec16 is complete).
  explicit BF16Vec16(const FP32Vec16 &);
  // Stores the sixteen values to `ptr`. Uses an unaligned store so that, like
  // the load above, it does not require `ptr` to be 32-byte aligned.
  void save(void *ptr) const {
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), reg);
  }
 };
 // Thirty-two bfloat16 values held as raw 16-bit lanes in a 512-bit integer
 // register.
 struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;
  __m512i reg;
  // Loads thirty-two values from `ptr`; no alignment is required.
  explicit BF16Vec32(const void *ptr) : reg(_mm512_loadu_si512(ptr)) {}
  explicit BF16Vec32(__m512i data) : reg(data) {}
  // Replicates the eight values of `vec8_data` into each of the four 128-bit
  // lanes. Takes a const reference because the argument is only read, which
  // also allows const and temporary BF16Vec8 objects to be passed.
  explicit BF16Vec32(const BF16Vec8 &vec8_data)
      : reg(_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512(vec8_data.reg),
                                   vec8_data.reg, 1),
                vec8_data.reg, 2),
            vec8_data.reg, 3)) {}
  // Stores the thirty-two values to `ptr`. Uses an unaligned store so that,
  // like the load above, it does not require `ptr` to be 64-byte aligned.
  void save(void *ptr) const { _mm512_storeu_si512(ptr, reg); }
 };
 // Four single-precision floats in a 128-bit register. Provides construction
 // only; no arithmetic is defined on this type.
 struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;
  // Overlay giving per-lane access to the register contents.
  union AliasReg {
    __m128 reg;
    float values[VEC_ELEM_NUM];
  };
  __m128 reg;
  // Broadcasts `v` to all lanes.
  explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
  // All lanes zero.
  explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
  // Loads four floats from `ptr` (unaligned load).
  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
  explicit FP32Vec4(__m128 data) : reg(data) {}
  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
 };
 // Eight single-precision floats in a 256-bit register, with lane-wise
 // arithmetic, a horizontal sum and a few element-wise math functions.
 struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  // Overlay giving per-lane access to the register contents.
  union AliasReg {
    __m256 reg;
    float values[VEC_ELEM_NUM];
  };
  __m256 reg;
  // Broadcasts `v` to all lanes.
  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
  // All lanes zero.
  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
  // Loads eight floats from `ptr` (unaligned load).
  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
  explicit FP32Vec8(__m256 data) : reg(data) {}
  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
 #ifdef __AVX512FP16__
  // Widens eight half-precision values to single precision.
  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
 #endif
  // Widens eight bfloat16 values: each 16-bit lane is zero-extended to 32 bits
  // and then moved up by two bytes, so the bfloat16 bits become the upper half
  // of the float and the lower half is zero.
  explicit FP32Vec8(const BF16Vec8 &v)
      : reg(_mm256_castsi256_ps(
            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
  // Returns the sum of all eight lanes, accumulated in lane order.
  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
    return result;
  }
  // Element-wise exponential, computed lane by lane with the scalar expf.
  FP32Vec8 exp() const {
    AliasReg ar;
    ar.reg = reg;
    // _mm256_set_ps takes its arguments from the highest lane to the lowest.
    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
                                  expf(ar.values[5]), expf(ar.values[4]),
                                  expf(ar.values[3]), expf(ar.values[2]),
                                  expf(ar.values[1]), expf(ar.values[0])));
  }
  // Element-wise hyperbolic tangent, computed lane by lane with tanhf.
  FP32Vec8 tanh() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
  }
  // Element-wise error function, computed lane by lane with erf.
  FP32Vec8 er() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
                                  erf(ar.values[5]), erf(ar.values[4]),
                                  erf(ar.values[3]), erf(ar.values[2]),
                                  erf(ar.values[1]), erf(ar.values[0])));
  }
  // Lane-wise arithmetic.
  FP32Vec8 operator*(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
  }
  FP32Vec8 operator+(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_add_ps(reg, b.reg));
  }
  FP32Vec8 operator-(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
  }
  FP32Vec8 operator/(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_div_ps(reg, b.reg));
  }
  // Stores the eight floats to `ptr` (unaligned store).
  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
 };
 // Sixteen single-precision floats in a 512-bit register, with lane-wise
 // arithmetic and full or partial horizontal sums.
 struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  // Overlay giving per-lane access to the register contents.
  union AliasReg {
    __m512 reg;
    float values[VEC_ELEM_NUM];
  };
  __m512 reg;
  // Broadcasts `v` to all lanes.
  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}
  // All lanes zero.
  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
  // Loads sixteen floats from `ptr` (unaligned load).
  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
  explicit FP32Vec16(__m512 data) : reg(data) {}
  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
  // Replicates the four floats of `data` into each of the four 128-bit lanes.
  explicit FP32Vec16(const FP32Vec4 &data)
      : reg((__m512)_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
                                   (__m128i)data.reg, 1),
                (__m128i)data.reg, 2),
            (__m128i)data.reg, 3)) {}
  // Replicates the eight floats of `data` into both 256-bit halves.
  explicit FP32Vec16(const FP32Vec8 &data)
      : reg((__m512)_mm512_inserti32x8(
            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
  // Widens sixteen bfloat16 values: zero-extend each 16-bit lane to 32 bits,
  // then move it up by two bytes so it forms the upper half of the float.
  explicit FP32Vec16(const BF16Vec16 &v)
      : reg(_mm512_castsi512_ps(
            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
  // Widens eight bfloat16 values to floats and replicates them into both
  // halves (via the FP32Vec8 constructors).
  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
  // Lane-wise arithmetic.
  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
  }
  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_add_ps(reg, b.reg));
  }
  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
  }
  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }
  // Returns the sum of all sixteen lanes.
  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
  // Returns the sum of the `group_size` consecutive lanes that start at lane
  // `idx * group_size`. `group_size` must divide the lane count.
  template <int group_size> float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    // Mask with the lowest `group_size` bits set, shifted to the chosen group.
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
    return _mm512_mask_reduce_add_ps(mask, reg);
  }
  // Stores the sixteen floats to `ptr` (unaligned store).
  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
 };
 // Maps a scalar element type to the SIMD wrapper used to load and store it.
 // Every mapped wrapper holds eight elements. Unmapped types resolve to `void`.
 template <typename T> struct VecType { using vec_type = void; };
 // Convenience alias: vec_t<float> is FP32Vec8, and so on.
 template <typename T> using vec_t = typename VecType<T>::vec_type;
 template <> struct VecType<float> { using vec_type = FP32Vec8; };
 #ifdef __AVX512FP16__
 // FP16Vec8 is the only half-precision wrapper declared in this header; the
 // previously named FP16Vec16 does not exist and would not compile.
 template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
 #endif
 template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
 // Stores the float `v` to `*ptr`, converting to the destination type T. The
 // generic version relies on T being assignable from float.
 template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
 #ifdef __AVX512FP16__
 // Half-precision destination: writes through a _Float16 view of the storage
 // so the compiler performs the float-to-half conversion.
 template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
  *reinterpret_cast<_Float16 *>(ptr) = v;
 }
 #endif
 // Accumulates the lane-wise product of `a` and `b` into `acc`
 // (acc = acc + a * b), written as a separate multiply and add.
 inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
  acc = acc + a * b;
 }
 // float -> bfloat16 conversions. With AVX512-BF16 the dedicated conversion
 // intrinsics are used; otherwise the conversion keeps 16 of the float's 32
 // bits and drops the other 16.
 #ifdef __AVX512BF16__
 // Scalar store using the hardware float -> bfloat16 conversion.
 template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
 }
 // Vector narrowing using the hardware conversion.
 inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
 inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
 // Accumulates the bfloat16 dot-product of `a` and `b` into the float lanes of
 // `acc` using _mm512_dpbf16_ps.
 inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
 }
 #else
 // Scalar store without hardware support: views the float as two
 // bfloat16-sized halves and copies the second one. On little-endian x86 that
 // is the upper 16 bits of the float, so the value is truncated rather than
 // rounded.
 template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = *(v_ptr + 1);
 }
 // Vector narrowing without hardware support: shift each float down by two
 // bytes and keep the low 16 bits of every 32-bit lane.
 inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg(_mm256_cvtepi32_epi16(
          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
 inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg(_mm512_cvtepi32_epi16(
          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
 #endif
 // Issues a software prefetch for the cache line containing `addr`, using the
 // T1 locality hint.
 inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
 }; // namespace vec_op
 #endif
--- a/csrc/cpu/layernorm.cpp
+++ b/csrc/cpu/layernorm.cpp
@@ -0,0 +1,117 @@
 #include "cpu_types.hpp"
 namespace {
 template <typename scalar_t>
 void rms_norm_impl(scalar_t *__restrict__ out,
                       const scalar_t *__restrict__ input,
                       const scalar_t *__restrict__ weight, const float epsilon,
                       const int num_tokens, const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);
 #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    vec_op::FP32Vec8 variance(0.0);
    auto input_p = input + i * hidden_size;
    auto output_p = out + i * hidden_size;
    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      vec_op::FP32Vec8 fp32_x(x);
      variance = variance + fp32_x * fp32_x;
    }
    float s_variance =
        1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon);
    vec_op::FP32Vec8 fp32_s_variance(s_variance);
    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      scalar_vec_t w(weight + j);
      vec_op::FP32Vec8 fp32_x(x);
      vec_op::FP32Vec8 fp32_w(w);
      vec_op::FP32Vec8 fp32_out = fp32_x * fp32_s_variance * fp32_w;
      scalar_vec_t out(fp32_out);
      out.save(output_p + j);
    }
  }
 }
 // In-place fused residual add and RMS normalization.
 //
 // For every token row:
 //   residual <- input + residual            (stored back as scalar_t)
 //   input    <- residual * weight / sqrt(mean((input + residual)^2) + epsilon)
 // The sum of squares is accumulated from the float sum before it is narrowed,
 // while the second pass re-reads the narrowed value from `residual`.
 // `hidden_size` must be a multiple of the vector width.
 template <typename scalar_t>
 void fused_add_rms_norm_impl(scalar_t *__restrict__ input,
                                 scalar_t *__restrict__ residual,
                                 const scalar_t *__restrict__ weight,
                                 const float epsilon, const int num_tokens,
                                 const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);
 #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    vec_op::FP32Vec8 variance(0.0);
    auto input_p = input + i * hidden_size;
    auto residual_p = residual + i * hidden_size;
    // Pass 1: add the residual, accumulate the sum of squares and write the
    // sum back into `residual`.
    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      scalar_vec_t res(residual_p + j);
      vec_op::FP32Vec8 fp32_x(x);
      vec_op::FP32Vec8 fp32_res(res);
      fp32_x = fp32_x + fp32_res;
      variance = variance + fp32_x * fp32_x;
      scalar_vec_t out(fp32_x);
      out.save(residual_p + j);
    }
    // Reciprocal RMS of the summed row.
    float s_variance =
        1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon);
    vec_op::FP32Vec8 fp32_s_variance(s_variance);
    // Pass 2: normalize the updated residual and write the result to `input`.
    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t w(weight + j);
      scalar_vec_t res(residual_p + j);
      vec_op::FP32Vec8 fp32_w(w);
      vec_op::FP32Vec8 fp32_res(res);
      vec_op::FP32Vec8 fp32_out = fp32_res * fp32_s_variance * fp32_w;
      scalar_vec_t out(fp32_out);
      out.save(input_p + j);
    }
  }
 }
 } // namespace
 // Applies RMS normalization to `input`, writing the result to `out`.
 // The last dimension of `input` is the hidden size; all leading dimensions are
 // flattened into the token count. Dispatches on the element type of `input`,
 // and `out` and `weight` are accessed with that same type.
 void rms_norm(torch::Tensor &out, torch::Tensor &input,
                  torch::Tensor &weight, float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] {
    CPU_KERNEL_GUARD_IN(rms_norm_impl)
    rms_norm_impl(out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
                      weight.data_ptr<scalar_t>(), epsilon, num_tokens,
                      hidden_size);
    CPU_KERNEL_GUARD_OUT(rms_norm_impl)
  });
 }
 // In-place fused residual add and RMS normalization: both `input` and
 // `residual` are overwritten by the typed implementation.
 // The last dimension of `input` is the hidden size; all leading dimensions are
 // flattened into the token count. Dispatches on the element type of `input`,
 // and `residual` and `weight` are accessed with that same type.
 void fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual,
                            torch::Tensor &weight, float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "fused_add_rms_norm_impl", [&] {
        CPU_KERNEL_GUARD_IN(fused_add_rms_norm_impl)
        fused_add_rms_norm_impl(
            input.data_ptr<scalar_t>(), residual.data_ptr<scalar_t>(),
            weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
        CPU_KERNEL_GUARD_OUT(fused_add_rms_norm_impl)
      });
 }
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -0,0 +1,199 @@
 #include "cpu_types.hpp"
 namespace {
 template <typename scalar_t>
 void rotary_embedding_impl(
    const int64_t
        *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
    scalar_t
        *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or
                             /// [num_tokens, num_heads, head_size]
    scalar_t
        *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or
                           // [num_tokens, num_kv_heads, head_size]
    const scalar_t
        *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  constexpr int ELEM_SIZE = sizeof(scalar_t);
  const int embed_dim = rot_dim / 2;
  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);
 #pragma omp parallel for
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    int64_t pos = positions[token_idx];
    const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
    for (int i = 0; i < num_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
        const int rot_offset = j;
        const int x_index = rot_offset;
        const int y_index = embed_dim + rot_offset;
        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;
        const scalar_vec_t cos(cache_ptr + x_index);
        const scalar_vec_t sin(cache_ptr + y_index);
        const scalar_vec_t q_x(query + out_x);
        const scalar_vec_t q_y(query + out_y);
        vec_op::FP32Vec8 fp32_cos(cos);
        vec_op::FP32Vec8 fp32_sin(sin);
        vec_op::FP32Vec8 fp32_q_x(q_x);
        vec_op::FP32Vec8 fp32_q_y(q_y);
        auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
        scalar_vec_t(out1).save(query + out_x);
        auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
        scalar_vec_t(out2).save(query + out_y);
      }
    }
    for (int i = 0; i < num_kv_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
        const int rot_offset = j;
        const int x_index = rot_offset;
        const int y_index = embed_dim + rot_offset;
        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;
        const scalar_vec_t cos(cache_ptr + x_index);
        const scalar_vec_t sin(cache_ptr + y_index);
        const scalar_vec_t k_x(key + out_x);
        const scalar_vec_t k_y(key + out_y);
        vec_op::FP32Vec8 fp32_cos(cos);
        vec_op::FP32Vec8 fp32_sin(sin);
        vec_op::FP32Vec8 fp32_k_x(k_x);
        vec_op::FP32Vec8 fp32_k_y(k_y);
        auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
        scalar_vec_t(out1).save(key + out_x);
        auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
        scalar_vec_t(out2).save(key + out_y);
      }
    }
  }
 }
 // Applies rotary position embedding in place to `query` and `key`, pairing
 // adjacent elements (2j, 2j + 1) of each head (the layout used when the
 // caller's `is_neox` flag is false).
 //
 // For each token, the cosine/sine row for its position is read from
 // `cos_sin_cache` (cosines in the first half of the row, sines in the second)
 // and every pair (x, y) of every query and key head becomes
 //   x' = x * cos - y * sin
 //   y' = y * cos + x * sin
 // with the arithmetic done in float.
 template <typename scalar_t>
 void rotary_embedding_gptj_impl(
    const int64_t
        *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
    scalar_t
        *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or
                             /// [num_tokens, num_heads, head_size]
    scalar_t
        *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or
                           // [num_tokens, num_kv_heads, head_size]
    const scalar_t
        *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
  const int embed_dim = rot_dim / 2;

  // Rotates the interleaved (even, odd) pairs of a single head in place.
  const auto rotate_pairs = [embed_dim](scalar_t *head,
                                        const scalar_t *cos_row,
                                        const scalar_t *sin_row) {
    for (int pair = 0; pair < embed_dim; ++pair) {
      scalar_t &even = head[2 * pair];
      scalar_t &odd = head[2 * pair + 1];
      const float c = cos_row[pair];
      const float s = sin_row[pair];
      const float x = even;
      const float y = odd;
      even = x * c - y * s;
      odd = y * c + x * s;
    }
  };

  // Query heads.
 #pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
      const scalar_t *row = cos_sin_cache + positions[token_idx] * rot_dim;
      const int64_t head_offset =
          token_idx * query_stride + head_idx * head_size;
      rotate_pairs(query + head_offset, row, row + embed_dim);
    }
  }

  // Key heads.
 #pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int head_idx = 0; head_idx < num_kv_heads; ++head_idx) {
      const scalar_t *row = cos_sin_cache + positions[token_idx] * rot_dim;
      const int64_t head_offset = token_idx * key_stride + head_idx * head_size;
      rotate_pairs(key + head_offset, row, row + embed_dim);
    }
  }
 }
 }; // namespace
 // Applies rotary position embedding in place to `query` and `key`.
 //
 // The last dimension of `query` / `key` is treated as heads * head_size, and
 // everything before it as tokens. The rotation width is taken from dimension
 // 1 of `cos_sin_cache`. `is_neox` selects between the two pairing layouts:
 // the split-halves implementation when true, the adjacent-pairs (GPT-J)
 // implementation when false. Dispatches on the element type of `query`;
 // `key` and `cos_sin_cache` are accessed with that same type.
 void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
                          torch::Tensor &key, int head_size,
                          torch::Tensor &cos_sin_cache, bool is_neox) {
  int num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
  // Element stride between consecutive tokens.
  int64_t key_stride = key.stride(-2);
  int64_t query_stride = query.stride(-2);
  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "rotary_embedding_impl", [&] {
        CPU_KERNEL_GUARD_IN(rotary_embedding_impl)
        if (is_neox) {
          rotary_embedding_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
              head_size, num_tokens);
        } else {
          rotary_embedding_gptj_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
              head_size, num_tokens);
        }
        CPU_KERNEL_GUARD_OUT(rotary_embedding_impl)
      });
 }
--- a/csrc/cpu/pybind.cpp
+++ b/csrc/cpu/pybind.cpp
@@ -0,0 +1,73 @@
 #include "cache.h"
 #include "cuda_utils.h"
 #include "ops.h"
 #include <torch/extension.h>
 // Python bindings for the CPU build. Exposes two submodules:
 //   ops        - attention, activation, normalization and rotary-embedding
 //                kernels
 //   cache_ops  - KV-cache block management kernels
 // Each entry binds a Python name to the C++ function of the same name,
 // together with its docstring.
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // vLLM custom ops
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
  // Attention ops
  ops.def(
    "paged_attention_v1",
    &paged_attention_v1,
    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
  ops.def(
    "paged_attention_v2",
    &paged_attention_v2,
    "PagedAttention V2.");
  // Activation ops
  ops.def(
    "silu_and_mul",
    &silu_and_mul,
    "Activation function used in SwiGLU.");
  ops.def(
    "gelu_and_mul",
    &gelu_and_mul,
    "Activation function used in GeGLU with `none` approximation.");
  ops.def(
    "gelu_tanh_and_mul",
    &gelu_tanh_and_mul,
    "Activation function used in GeGLU with `tanh` approximation.");
  ops.def(
    "gelu_new",
    &gelu_new,
    "GELU implementation used in GPT-2.");
  ops.def(
    "gelu_fast",
    &gelu_fast,
    "Approximate GELU implementation.");
  // Layernorm
  ops.def(
    "rms_norm",
    &rms_norm,
    "Apply Root Mean Square (RMS) Normalization to the input tensor.");
  ops.def(
    "fused_add_rms_norm",
    &fused_add_rms_norm,
    "In-place fused Add and RMS Normalization");
  // Rotary embedding
  ops.def(
    "rotary_embedding",
    &rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def(
    "swap_blocks",
    &swap_blocks,
    "Swap in (out) the cache blocks from src to dst");
  cache_ops.def(
    "copy_blocks",
    &copy_blocks,
    "Copy the cache blocks from src to dst");
  cache_ops.def(
    "reshape_and_cache",
    &reshape_and_cache,
    "Reshape the key and value tensors and cache them");
 }
--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
@@ -1,5 +1,15 @@
 #pragma once
 #ifdef USE_ROCM
 #include <hip/hip_runtime.h>
 #endif
 #ifndef USE_ROCM
  #define WARP_SIZE 32
 #else
  #define WARP_SIZE warpSize
 #endif
 #ifndef USE_ROCM
  #define VLLM_LDG(arg) __ldg(arg)
 #else
--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@@ -29,7 +29,7 @@ fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data,
    std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t));
  }
  return (fptr_t) new vllm::CustomAllreduce(
-      reinterpret_cast<vllm::Metadata *>(meta.data_ptr()), rank_data.data_ptr(),
+      reinterpret_cast<vllm::Signal *>(meta.data_ptr()), rank_data.data_ptr(),
      rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
 }
@@ -62,9 +62,9 @@ bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size,
  if (inp_size % 16 != 0) return false;
  if (!_is_weak_contiguous(inp)) return false;
  if (world_size == 2 || full_nvlink) return inp_size <= max_size;
-  // 4 PCIE GPUs use 2 stage allreduce, and is only faster than NCCL when size
+  // for 4 or more non NVLink-capable GPUs, custom allreduce provides little
-  // <= 512k
+  // performance improvement over NCCL.
-  return world_size <= 4 && inp_size <= 512 * 1024;
+  return false;
 }
 void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out,
@@ -126,7 +126,7 @@ void dispose(fptr_t _fa) {
  delete fa;
 }
-int meta_size() { return sizeof(vllm::Metadata); }
+int meta_size() { return sizeof(vllm::Signal); }
 void register_buffer(fptr_t _fa, torch::Tensor &t,
                     const std::vector<std::string> &handles,
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -23,29 +23,17 @@
 namespace vllm {
 constexpr int kMaxBlocks = 64;
 // note: we don't want to use atomics for signals because peer atomics are
 // not supported on PCIe links
 struct Signal {
-  alignas(64) union {
+  alignas(128) uint32_t start[kMaxBlocks][8];
-    uint64_t flag;
+  alignas(128) uint32_t end[kMaxBlocks][8];
    unsigned char data[8];
  } start;
  alignas(64) union {
    uint64_t flag;
    unsigned char data[8];
  } end;
 };
 struct Metadata {
  alignas(128) Signal sg;
  alignas(128) int counter;
 };
 static_assert(offsetof(Metadata, counter) == 128);
 static_assert(sizeof(Metadata) == 256);
 struct __align__(16) RankData { const void *__restrict__ ptrs[8]; };
-struct RankSignals {
+struct __align__(16) RankSignals { volatile Signal *signals[8]; };
  volatile Signal *signals[8];
 };
 // like std::array, but aligned
 template <typename T, int sz>
@@ -135,70 +123,49 @@ DINLINE O downcast(array_t<float, O::size> val) {
  }
 }
-// compute flag at compile time
+// This function is meant to be used as the first synchronization in the all
-__host__ __device__ constexpr uint64_t compute_flag(int ngpus) {
+// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
-  auto m = std::numeric_limits<uint64_t>::max();
+// prior memory accesses. Note: volatile writes will not be reordered against
-  return m >> ((8 - ngpus) * 8);
+// other volatile writes.
 }
 template <int ngpus>
-DINLINE void start_sync(const RankSignals &sg, volatile Metadata *meta,
+DINLINE void start_sync(const RankSignals &sg, volatile Signal *self_sg,
                        int rank) {
-  constexpr auto FLAG = compute_flag(ngpus);
+  if (threadIdx.x < ngpus) {
-  if (blockIdx.x == 0) {
+    // reset flag for next time
-    if (threadIdx.x < ngpus)
+    self_sg->end[blockIdx.x][threadIdx.x] = 0;
-      // simultaneously write to the corresponding byte to all other ranks.
+    // simultaneously write to the corresponding flag of all ranks.
-      // Latency = 1 p2p write
+    // Latency = 1 p2p write
-      sg.signals[threadIdx.x]->start.data[rank] = 255;
+    sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1;
-    else if (threadIdx.x == 32)
+    // wait until we got true from all ranks
-      // reset
+    while (!self_sg->start[blockIdx.x][threadIdx.x])
      meta->sg.end.flag = 0;
  }
  if (threadIdx.x == 0) {
    while (meta->sg.start.flag != FLAG)
      ;
  }
  __syncthreads();
 }
 // This function is meant to be used as the second or the final synchronization
 // barrier in the all reduce kernel. If it's the final synchronization barrier,
 // we don't need to make any visibility guarantees for prior memory accesses.
 template <int ngpus, bool final_sync = false>
-DINLINE void end_sync(const RankSignals &sg, volatile Metadata *meta,
+DINLINE void end_sync(const RankSignals &sg, volatile Signal *self_sg,
                      int rank) {
  constexpr auto FLAG = compute_flag(ngpus);
  __syncthreads();
-  __shared__ int num;
+  // eliminate the case that prior writes are not visible after signals become
-  if (threadIdx.x == 0) num = atomicAdd((int *)&meta->counter, 1);
+  // visible. Note that I did not manage to make this happen through a lot of
-  __syncthreads();
+  // testing. Might be the case that hardware provides stronger guarantee than
-
+  // the memory model. 
-  // Only the last completing block can perform the end synchronization
+  if constexpr (!final_sync) __threadfence_system();
-  // This can ensures when the final busy wait ends, all ranks must have
+  if (threadIdx.x < ngpus) {
-  // finished reading each other's buffer.
+    // reset flag for next time
-  if (num == gridDim.x - 1) {
+    self_sg->start[blockIdx.x][threadIdx.x] = 0;
-    if (threadIdx.x == 32) {
+    // simultaneously write to the corresponding flag of all ranks.
-      // reset in a different warp
+    // Latency = 1 p2p write
-      meta->counter = 0;
+    sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1;
-      meta->sg.start.flag = 0;
+    // wait until we got true from all ranks
-    } else if (threadIdx.x < ngpus) {
+    while (!self_sg->end[blockIdx.x][threadIdx.x])
-      // simultaneously write to the corresponding byte to all other ranks.
+      ;
      // Latency = 1 p2p write
      sg.signals[threadIdx.x]->end.data[rank] = 255;
    }
    // if this is the final sync, only one block needs it
    // because kernel exit can serve as sync
    if constexpr (final_sync) {
      if (threadIdx.x == 0) {
        while (meta->sg.end.flag != FLAG)
          ;
      }
    }
  }
  if constexpr (!final_sync) {
    if (threadIdx.x == 0) {
      while (meta->sg.end.flag != FLAG)
        ;
    }
    __syncthreads();
  }
  if constexpr (!final_sync) __syncthreads();
 }
 template <typename P, int ngpus, typename A>
@@ -214,32 +181,32 @@ DINLINE P packed_reduce(const P *ptrs[], int idx) {
 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
    cross_device_reduce_1stage(RankData *_dp, RankSignals sg,
-                               volatile Metadata *meta, T *__restrict__ result,
+                               volatile Signal *self_sg, T *__restrict__ result,
                               int rank, int size) {
  using P = typename packed_t<T>::P;
  using A = typename packed_t<T>::A;
  // note: we don't reorder the address so the accumulation order is the same
  // for all ranks, ensuring bitwise identical results
  auto dp = *_dp;
-  start_sync<ngpus>(sg, meta, rank);
+  start_sync<ngpus>(sg, self_sg, rank);
  // do the actual reduction
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    ((P *)result)[idx] =
        packed_reduce<P, ngpus, A>((const P **)&dp.ptrs[0], idx);
  }
-  end_sync<ngpus, true>(sg, meta, rank);
+  end_sync<ngpus, true>(sg, self_sg, rank);
 }
 template <typename P>
 DINLINE P *get_tmp_buf(volatile Signal *sg) {
-  return (P *)(((Metadata *)sg) + 1);
+  return (P *)(((Signal *)sg) + 1);
 }
 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
    cross_device_reduce_2stage(RankData *_dp, RankSignals sg,
-                               volatile Metadata *meta, T *__restrict__ result,
+                               volatile Signal *self_sg, T *__restrict__ result,
                               int rank, int size) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
@@ -248,6 +215,7 @@ __global__ void __launch_bounds__(512, 1)
  int part = size / ngpus;
  int start = rank * part;
  int end = rank == ngpus - 1 ? size : start + part;
  int largest_part = part + size % ngpus;
  const P *ptrs[ngpus];
  P *tmps[ngpus];
 #pragma unroll
@@ -257,75 +225,28 @@ __global__ void __launch_bounds__(512, 1)
    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
  }
  auto tmp_out = tmps[0];
-  start_sync<ngpus>(sg, meta, rank);
+  start_sync<ngpus>(sg, self_sg, rank);
  // stage 1: reduce scatter
  for (int idx = start + tid; idx < end; idx += stride) {
    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
  }
-  // Maybe TODO: replace this with per-block release-acquire
+  end_sync<ngpus>(sg, self_sg, rank);
  // can save about 1-2us (not a lot though)
  end_sync<ngpus>(sg, meta, rank);
-  // stage 2: allgather
+  // stage 2: allgather. Note: it's important to match the tid between
-  for (int idx = tid; idx < part; idx += stride) {
+  // the two stages, because visibility across devices is only guaranteed
  // between threads that have the same tid. If thread i computes the sum of
  // start + i in the first stage, then thread i also gathers start + i from all
  // ranks.
  for (int idx = tid; idx < largest_part; idx += stride) {
 #pragma unroll
    for (int i = 0; i < ngpus; i++) {
-      int dst_idx = ((rank + i) % ngpus) * part + idx;
+      int gather_from_rank = ((rank + i) % ngpus);
-      ((P *)result)[dst_idx] = tmps[i][idx];
+      if (gather_from_rank == ngpus - 1 || idx < part) {
        int dst_idx = gather_from_rank * part + idx;
        ((P *)result)[dst_idx] = tmps[i][idx];
      }
    }
  }
  // process the last larger partition
  int remaining = size - part * ngpus;
  if (tid < remaining) {
    int dst_idx = tid + part * ngpus;
    ((P *)result)[dst_idx] = get_tmp_buf<P>(sg.signals[ngpus - 1])[part + tid];
  }
  // faster than this
  // for (int idx = tid; idx < size; idx += stride) {
  //   int target_rank = idx / part;
  //   if (target_rank == ngpus) target_rank -= 1;
  //   ((P *)result)[idx] = tmps[target_rank][idx - target_rank * part];
  // }
 }
 // Two-phase all-reduce kernel for `ngpus` ranks split into groups of
 // hg = ngpus / 2 consecutive ranks.
 //   Phase 1: each rank sums the input buffers of the ranks in its own group
 //            (_dp->ptrs[start .. start + hg - 1]) into its own temporary
 //            buffer, obtained via get_tmp_buf from sg.signals[rank].
 //   Phase 2: after end_sync, each rank adds the temporary buffer of rank
 //            (ngpus - 1) - rank % ngpus (a rank in the other group when ngpus
 //            is even) to its own partial sum and stores the total in `result`.
 // `size` is counted in packed elements P: both loops index P-typed pointers
 // with idx < size. Each thread handles indices tid, tid + stride, ...
 // NOTE(review): this kernel still takes `volatile Metadata *meta` and forwards
 // it to start_sync/end_sync, while the other reduce kernels in this file pass
 // a `volatile Signal *self_sg` -- confirm this kernel is still meant to build.
 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
    cross_device_reduce_half_butterfly(RankData *_dp, RankSignals sg,
                                       volatile Metadata *meta,
                                       T *__restrict__ result, int rank,
                                       int size) {
  // global thread index and total number of threads in the grid
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  using P = typename packed_t<T>::P;
  using A = typename packed_t<T>::A;
  // this rank's own temporary buffer; receives the in-group partial sums
  auto tmp_out = get_tmp_buf<P>(sg.signals[rank]);
  constexpr int hg = ngpus / 2;
  // Actually not quite half butterfly.
  // This is an all-to-all within each group containing half of the ranks
  // followed by cross-group add. Equivalent to half butterfly when there
  // are 4 GPUs, a common case for PCIe cards like T4 and A10.
  const P *ptrs[hg];
  {
    // first rank of the group this rank belongs to
    int start = rank - rank % hg;
 #pragma unroll
    for (int i = 0; i < hg; i++) {
      ptrs[i] = (const P *)_dp->ptrs[i + start];
    }
  }
  // presumably a cross-rank barrier before peers' inputs are read -- see
  // start_sync for the exact guarantees
  start_sync<ngpus>(sg, meta, rank);
  // phase 1: in-group reduction into this rank's temporary buffer
  for (int idx = tid; idx < size; idx += stride) {
    tmp_out[idx] = packed_reduce<P, hg, A>(ptrs, idx);
  }
  // non-final sync (final_sync defaults to false): the peer's temporary
  // buffer is read right after this call
  end_sync<ngpus>(sg, meta, rank);
  // temporary buffer of the mirrored rank, holding the other partial sum
  auto src = get_tmp_buf<P>(sg.signals[(ngpus - 1) - rank % ngpus]);
  // do the cross group reduction
  for (int idx = tid; idx < size; idx += stride) {
    auto tmp = tmp_out[idx];
    packed_assign_add(tmp, src[idx]);
    ((P *)result)[idx] = tmp;
  }
 }
 using IPC_KEY = std::array<uint8_t, sizeof(cudaIpcMemHandle_t)>;
@@ -341,7 +262,7 @@ class CustomAllreduce {
  // below are device pointers
  RankSignals sg_;
  std::unordered_map<void *, RankData *> buffers_;
-  Metadata *meta_;
+  Signal *self_sg_;
  // stores the registered device pointers from all ranks
  RankData *d_rank_data_base_, *d_rank_data_end_;
@@ -352,32 +273,32 @@ class CustomAllreduce {
  /**
   * meta is a pointer to device metadata and temporary buffer for allreduce.
   *
-   * There's a total of sizeof(Metadata) of prefix before the actual data,
+   * There's a total of sizeof(Signal) of prefix before the actual data,
   * so meta + 1 points to actual temporary buffer.
   *
   * note: this class does not own any device memory. Any required buffers
   * are passed in from the constructor
   */
-  CustomAllreduce(Metadata *meta, void *rank_data, size_t rank_data_sz,
+  CustomAllreduce(Signal *meta, void *rank_data, size_t rank_data_sz,
                  const cudaIpcMemHandle_t *handles,
                  const std::vector<int64_t> &offsets, int rank,
                  bool full_nvlink = true)
      : rank_(rank),
        world_size_(offsets.size()),
        full_nvlink_(full_nvlink),
-        meta_(meta),
+        self_sg_(meta),
        d_rank_data_base_(reinterpret_cast<RankData *>(rank_data)),
        d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
    for (int i = 0; i < world_size_; i++) {
-      Metadata *rank_meta;
+      Signal *rank_sg;
      if (i != rank_) {
        char *handle = open_ipc_handle(&handles[i]);
        handle += offsets[i];
-        rank_meta = (Metadata *)handle;
+        rank_sg = (Signal *)handle;
      } else {
-        rank_meta = meta_;
+        rank_sg = self_sg_;
      }
-      sg_.signals[i] = &rank_meta->sg;
+      sg_.signals[i] = rank_sg;
    }
  }
@@ -492,6 +413,10 @@ class CustomAllreduce {
          "custom allreduce currently requires input length to be multiple "
          "of " +
          std::to_string(d));
    if (block_limit > kMaxBlocks)
      throw std::runtime_error("max supported block limit is " +
                               std::to_string(kMaxBlocks) + ". Got " +
                               std::to_string(block_limit));
    RankData *ptrs;
    cudaStreamCaptureStatus status;
@@ -512,9 +437,9 @@ class CustomAllreduce {
    size /= d;
    auto bytes = size * sizeof(typename packed_t<T>::P);
    int blocks = std::min(block_limit, (size + threads - 1) / threads);
-#define KL(ngpus, name) \
+#define KL(ngpus, name)                                                       \
-  name<T, ngpus>        \
+  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
-      <<<blocks, threads, 0, stream>>>(ptrs, sg_, meta_, output, rank_, size);
+                                                 rank_, size);
 #define REDUCE_CASE(ngpus)                            \
  case ngpus: {                                       \
    if (world_size_ == 2) {                           \
@@ -526,8 +451,6 @@ class CustomAllreduce {
      } else {                                        \
        KL(ngpus, cross_device_reduce_2stage);        \
      }                                               \
    } else {                                          \
      KL(ngpus, cross_device_reduce_half_butterfly);  \
    }                                                 \
    break;                                            \
  }
@@ -556,7 +479,7 @@ class CustomAllreduce {
 /**
 * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
 a template instantiation:
- * template void CustomAllreduce::allreduce<half>(cudaStream_t, half *, half *,
+ * template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
- int, int, int);
+ half *, int, int, int);
 */
 }  // namespace vllm
--- a/csrc/custom_all_reduce_test.cu
+++ b/csrc/custom_all_reduce_test.cu
@@ -92,7 +92,7 @@ __global__ void gen_data(curandState_t *state, T *data, double *ground_truth,
 template <typename T>
 void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
-         int data_size) {
+         int data_size, bool performance_test) {
  T *result;
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
@@ -101,7 +101,7 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
  cudaIpcMemHandle_t self_data_handle;
  cudaIpcMemHandle_t data_handles[8];
-  vllm::Metadata *buffer;
+  vllm::Signal *buffer;
  T *self_data_copy;
  /**
   * Allocate IPC buffer
@@ -115,9 +115,9 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
   * convenience.
   */
  CUDACHECK(
-      cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Metadata)));
+      cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
-  CUDACHECK(cudaMemset(buffer, 0,
+  CUDACHECK(
-                       2 * data_size * sizeof(T) + sizeof(vllm::Metadata)));
+      cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
  CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
  CUDACHECK(cudaIpcGetMemHandle(&self_data_handle, buffer));
@@ -133,7 +133,7 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
                           offsets, myRank);
  auto *self_data =
      reinterpret_cast<T *>(reinterpret_cast<char *>(buffer) +
-                            sizeof(vllm::Metadata) + data_size * sizeof(T));
+                            sizeof(vllm::Signal) + data_size * sizeof(T));
  // hack buffer registration
  {
    std::vector<std::string> handles;
@@ -143,8 +143,8 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
      char *end = (char *)&data_handles[i + 1];
      handles.emplace_back(begin, end);
    }
-    std::vector<int64_t> offsets(
+    std::vector<int64_t> offsets(nRanks,
-        nRanks, sizeof(vllm::Metadata) + data_size * sizeof(T));
+                                 sizeof(vllm::Signal) + data_size * sizeof(T));
    fa.register_buffer(handles, offsets, self_data);
  }
@@ -169,81 +169,112 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
  } else {
    ncclDtype = ncclFloat;
  }
  dummy_kernel<<<1, 1, 0, stream>>>();
  constexpr int warmup_iters = 5;
  constexpr int num_iters = 25;
  // warmup
  for (int i = 0; i < warmup_iters; i++) {
    NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, comm,
                            stream));
  }
  CUDACHECK(cudaEventRecord(start, stream));
  for (int i = 0; i < num_iters; i++) {
    NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, comm,
                            stream));
  }
  CUDACHECK(cudaEventRecord(stop, stream));
  CUDACHECK(cudaStreamSynchronize(stream));
  float allreduce_ms = 0;
  cudaEventElapsedTime(&allreduce_ms, start, stop);
  // if (myRank == 1) dummy_kernel<<<1, 1, 0, stream>>>();
  // set_data<T><<<16, 1024, 0, stream>>>(self_data, data_size, myRank);
  dummy_kernel<<<1, 1, 0, stream>>>();
  // warm up
  for (int i = 0; i < warmup_iters; i++) {
    fa.allreduce<T>(stream, self_data, result, data_size, threads, block_limit);
  }
  CUDACHECK(cudaEventRecord(start, stream));
  for (int i = 0; i < num_iters; i++) {
    fa.allreduce<T>(stream, self_data, result, data_size, threads, block_limit);
  }
  CUDACHECK(cudaEventRecord(stop, stream));
  CUDACHECK(cudaStreamSynchronize(stream));
  float duration_ms = 0;
  cudaEventElapsedTime(&duration_ms, start, stop);
  if (myRank == 0)
    printf(
        "Rank %d done, nGPUs:%d, sz (kb): %d, %d, %d, my time:%.2fus, nccl "
        "time:%.2fus\n",
        myRank, nRanks, data_size * sizeof(T) / 1024, threads, block_limit,
        duration_ms * 1e3 / num_iters, allreduce_ms * 1e3 / num_iters);
  // And wait for all the queued up work to complete
  CUDACHECK(cudaStreamSynchronize(stream));
  NCCLCHECK(ncclAllReduce(self_data_copy, self_data, data_size, ncclDtype,
                          ncclSum, comm, stream));
  double *nccl_result, *my_result;
  CUDACHECK(cudaMallocHost(&nccl_result, data_size * sizeof(double)));
  CUDACHECK(cudaMallocHost(&my_result, data_size * sizeof(double)));
-
+  if (performance_test) {
-  convert_data<T><<<108, 1024, 0, stream>>>(self_data, result, nccl_result,
+    dummy_kernel<<<1, 1, 0, stream>>>();
-                                            my_result, data_size);
+    constexpr int warmup_iters = 5;
-  CUDACHECK(cudaStreamSynchronize(stream));
+    constexpr int num_iters = 100;
-
+    // warmup
-  for (unsigned long j = 0; j < data_size; j++) {
+    for (int i = 0; i < warmup_iters; i++) {
-    auto diff = abs(nccl_result[j] - my_result[j]);
+      NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum,
-    if (diff >= 1e-2) {
+                              comm, stream));
      printf("Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n",
             myRank, j, nccl_result[j], my_result[j], ground_truth[j]);
      break;
    }
-  }
+    CUDACHECK(cudaEventRecord(start, stream));
    for (int i = 0; i < num_iters; i++) {
      NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum,
                              comm, stream));
    }
    CUDACHECK(cudaEventRecord(stop, stream));
    CUDACHECK(cudaStreamSynchronize(stream));
    float allreduce_ms = 0;
    cudaEventElapsedTime(&allreduce_ms, start, stop);
-  long double nccl_diffs = 0.0;
+    dummy_kernel<<<1, 1, 0, stream>>>();
-  long double my_diffs = 0.0;
+    // warm up
-  for (int j = 0; j < data_size; j++) {
+    for (int i = 0; i < warmup_iters; i++) {
-    nccl_diffs += abs(nccl_result[j] - ground_truth[j]);
+      fa.allreduce<T>(stream, self_data, result, data_size, threads,
-    my_diffs += abs(my_result[j] - ground_truth[j]);
+                      block_limit);
    }
    CUDACHECK(cudaEventRecord(start, stream));
    for (int i = 0; i < num_iters; i++) {
      fa.allreduce<T>(stream, self_data, result, data_size, threads,
                      block_limit);
    }
    CUDACHECK(cudaEventRecord(stop, stream));
    CUDACHECK(cudaStreamSynchronize(stream));
    float duration_ms = 0;
    cudaEventElapsedTime(&duration_ms, start, stop);
    if (myRank == 0)
      printf(
          "Rank %d done, nGPUs:%d, sz (kb): %d, %d, %d, my time:%.2fus, nccl "
          "time:%.2fus\n",
          myRank, nRanks, data_size * sizeof(T) / 1024, threads, block_limit,
          duration_ms * 1e3 / num_iters, allreduce_ms * 1e3 / num_iters);
    // And wait for all the queued up work to complete
    CUDACHECK(cudaStreamSynchronize(stream));
    NCCLCHECK(ncclAllReduce(self_data_copy, self_data, data_size, ncclDtype,
                            ncclSum, comm, stream));
    convert_data<T><<<108, 1024, 0, stream>>>(self_data, result, nccl_result,
                                              my_result, data_size);
    CUDACHECK(cudaStreamSynchronize(stream));
    for (unsigned long j = 0; j < data_size; j++) {
      auto diff = abs(nccl_result[j] - my_result[j]);
      if (diff >= 4e-2) {
        printf("Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n",
               myRank, j, nccl_result[j], my_result[j], ground_truth[j]);
        break;
      }
    }
    long double nccl_diffs = 0.0;
    long double my_diffs = 0.0;
    for (int j = 0; j < data_size; j++) {
      nccl_diffs += abs(nccl_result[j] - ground_truth[j]);
      my_diffs += abs(my_result[j] - ground_truth[j]);
    }
    if (myRank == 0)
      std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size
                << " me: " << my_diffs / data_size << std::endl;
  } else {
    for (int i = 0; i < 100; i++) {
      fa.allreduce<T>(stream, self_data, result, data_size, threads,
                      block_limit);
      CUDACHECK(cudaStreamSynchronize(stream));
      NCCLCHECK(ncclAllReduce(self_data, self_data_copy, data_size, ncclDtype,
                              ncclSum, comm, stream));
      convert_data<T><<<108, 1024, 0, stream>>>(
          self_data_copy, result, nccl_result, my_result, data_size);
      CUDACHECK(cudaStreamSynchronize(stream));
      for (unsigned long j = 0; j < data_size; j++) {
        auto diff = abs(nccl_result[j] - my_result[j]);
        if (diff >= 4e-2) {
          printf(
              "Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n",
              myRank, j, nccl_result[j], my_result[j], ground_truth[j]);
          break;
        }
      }
    }
    if (myRank == 0)
      printf("Test passed: nGPUs:%d, sz (kb): %d, %d, %d\n", nRanks,
             data_size * sizeof(T) / 1024, threads, block_limit);
    // long double nccl_diffs = 0.0;
    // long double my_diffs = 0.0;
    // for (int j = 0; j < data_size; j++) {
    //   nccl_diffs += abs(nccl_result[j] - ground_truth[j]);
    //   my_diffs += abs(my_result[j] - ground_truth[j]);
    // }
    // if (myRank == 0)
    //   std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size
    //             << " me: " << my_diffs / data_size << std::endl;
  }
  if (myRank == 0)
    std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size
              << " me: " << my_diffs / data_size << std::endl;
  CUDACHECK(cudaFree(result));
  CUDACHECK(cudaFree(self_data_copy));
@@ -269,14 +300,15 @@ int main(int argc, char **argv) {
                     MPI_COMM_WORLD));
  NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));
  bool performance_test = true;
  cudaProfilerStart();
  // for (int threads : {256, 512}) {
  //   for (int block_limit = 16; block_limit < 112; block_limit += 4) {
  //     run<half>(myRank, nRanks, comm, threads, block_limit, 4096 * 1024);
  //   }
  // }
-  for (int sz = 512; sz <= (32 << 20); sz *= 2) {
+  for (int sz = 512; sz <= (8 << 20); sz *= 2) {
-    run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 50);
+    run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
  }
  cudaProfilerStop();
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -4,6 +4,16 @@
 #include "dispatch_utils.h"
 #include "reduction_utils.cuh"
 #ifndef USE_ROCM
  #include <cuda_bf16.h>
  #include <cuda_fp16.h>
 #else
  #include <hip/hip_bf16.h>
  #include <hip/hip_fp16.h>
  using __nv_bfloat16 = __hip_bfloat16;
  using __nv_bfloat162 = __hip_bfloat162;
 #endif
 namespace vllm {
@@ -35,9 +45,201 @@ __global__ void rms_norm_kernel(
  }
 }
-// TODO: Further optimize this kernel.
+
-template<typename scalar_t>
+/* Converter structs for the conversion from torch types to HIP/CUDA types,
-__global__ void fused_add_rms_norm_kernel(
+   and the associated type conversions within HIP/CUDA. These helpers need
   to be implemented for now because the relevant type conversion
   operators/constructors are not consistently implemented by HIP/CUDA, so
   a generic conversion via type casts cannot be implemented.
   Each struct should have the member static constexpr bool `exists`:
   If false, the optimized kernel is not used for the corresponding torch type.
   If true, the struct should be fully defined as shown in the examples below. 
 */
 // Primary template: used for every torch type that has no specialization.
 // `exists == false` makes fused_add_rms_norm_kernel select its generic
 // (non-vectorized) overload for that type.
 template <typename torch_type>
 struct _typeConvert {
  static constexpr bool exists = false;
 };
 #if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
 // CUDA < 12.0 runs into issues with packed type conversion
 template<>
 struct _typeConvert<c10::Half> {
  // A converter is available for FP16, so the vectorized kernel may be used.
  static constexpr bool exists = true;

  // Device-side scalar type and its two-lane packed counterpart.
  using hip_type = __half;
  using packed_hip_type = __half2;

  // half -> float
  __device__ static inline float convert(hip_type h) {
    return __half2float(h);
  }

  // packed half2 -> float2 (both lanes)
  __device__ static inline float2 convert(packed_hip_type h2) {
    return __half22float2(h2);
  }

  // float -> half, using the round-to-nearest (_rn) intrinsic
  __device__ static inline hip_type convert(float f) {
    return __float2half_rn(f);
  }

  // float2 -> packed half2, using the round-to-nearest (_rn) intrinsic
  __device__ static inline packed_hip_type convert(float2 f2) {
    return __float22half2_rn(f2);
  }
 };
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
 // CUDA_ARCH < 800 does not have BF16 support
 // TODO: Add in ROCm support once public headers handle bf16 maturely
 template<>
 struct _typeConvert<c10::BFloat16> {
  // A converter is available for BF16, so the vectorized kernel may be used.
  static constexpr bool exists = true;

  // Device-side scalar type and its two-lane packed counterpart.
  using hip_type = __nv_bfloat16;
  using packed_hip_type = __nv_bfloat162;

  // bf16 -> float
  __device__ static inline float convert(hip_type b) {
    return __bfloat162float(b);
  }

  // packed bf16x2 -> float2 (both lanes)
  __device__ static inline float2 convert(packed_hip_type b2) {
    return __bfloat1622float2(b2);
  }

  // float -> bf16
  __device__ static inline hip_type convert(float f) {
    return __float2bfloat16(f);
  }

  // float2 -> packed bf16x2, using the round-to-nearest (_rn) intrinsic
  __device__ static inline packed_hip_type convert(float2 f2) {
    return __float22bfloat162_rn(f2);
  }
 };
 #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
 #endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
 /* Vector POD struct to generate vectorized and packed FP16/BF16 ops
   for appropriate specializations of fused_add_rms_norm_kernel.
   Only functions that are necessary in that kernel are implemented.
   Alignment to 16 bytes is required to use 128-bit global memory ops.

   Template parameters:
     scalar_t - torch element type; _typeConvert<scalar_t> must provide
                hip_type, packed_hip_type and the convert() overloads.
     width    - number of elements held in `data`.
 */
 template<typename scalar_t, int width>
 struct alignas(16) _f16Vec {
  /* Not theoretically necessary that width is a power of 2 but should
     almost always be the case for optimization purposes */
  static_assert(width > 0 && (width & (width - 1)) == 0,
                "Width is not a positive power of 2!");
  using Converter = _typeConvert<scalar_t>;
  using T1 = typename Converter::hip_type;         // scalar element type
  using T2 = typename Converter::packed_hip_type;  // two-element packed type
  T1 data[width];

  // Element-wise in-place addition of `other`.
  // For even width, elements are processed two at a time through the packed
  // type T2 and its `+=`; otherwise (with the static_assert above, only
  // width == 1) each element is added individually.
  __device__ _f16Vec& operator+=(const _f16Vec<scalar_t, width>& other) {
    if constexpr (width % 2 == 0) {
      #pragma unroll
      for (int i = 0; i < width; i += 2) {
        T2 temp{data[i], data[i+1]};
        temp += T2{other.data[i], other.data[i+1]};
        data[i] = temp.x;
        data[i+1] = temp.y;
      }
    } else {
      #pragma unroll
      for (int i = 0; i < width; ++i)
        data[i] += other.data[i];
    }
    return *this;
  }

  // Element-wise in-place multiplication by `other`, with the same
  // packed-pair / scalar split as operator+=.
  __device__ _f16Vec& operator*=(const _f16Vec<scalar_t, width>& other) {
    if constexpr (width % 2 == 0) {
      #pragma unroll
      for (int i = 0; i < width; i += 2) {
        T2 temp{data[i], data[i+1]};
        temp *= T2{other.data[i], other.data[i+1]};
        data[i] = temp.x;
        data[i+1] = temp.y;
      }
    } else {
      #pragma unroll
      for (int i = 0; i < width; ++i)
        data[i] *= other.data[i];
    }
    return *this;
  }

  // In-place multiplication of every element by the float `scale`.
  // Each element (or pair) is converted to float, scaled in float, and
  // converted back to the 16-bit type through Converter::convert.
  __device__ _f16Vec& operator*=(const float scale) {
    if constexpr (width % 2 == 0) {
      #pragma unroll
      for (int i = 0; i < width; i += 2) {
        float2 temp_f = Converter::convert(T2{data[i], data[i+1]});
        temp_f.x *= scale;
        temp_f.y *= scale;
        T2 temp = Converter::convert(temp_f);
        data[i] = temp.x;
        data[i+1] = temp.y;
      }
    } else {
      #pragma unroll
      for (int i = 0; i < width; ++i) {
        float temp = Converter::convert(data[i]) * scale;
        data[i] = Converter::convert(temp);
      }
    }
    return *this;
  }

  // Returns the sum of the squares of all elements, with each element
  // converted to float first and the accumulation done in float.
  __device__ float sum_squares() const {
    float result = 0.0f;
    if constexpr (width % 2 == 0) {
      #pragma unroll
      for (int i = 0; i < width; i += 2) {
        float2 z = Converter::convert(T2{data[i], data[i+1]});
        result += z.x * z.x + z.y * z.y;
      }
    } else {
      #pragma unroll
      for (int i = 0; i < width; ++i) {
        float x = Converter::convert(data[i]);
        result += x * x;
      }
    }
    return result;
  }
 };
 /* Function specialization in the case of FP16/BF16 tensors.
   Additional optimizations we can make in this case are
   packed and vectorized operations, which help with the
   memory latency bottleneck.

   Selected when width > 0 and _typeConvert<scalar_t>::exists is true.
   Each block handles the row selected by blockIdx.x, in two passes:
     1. residual <- input + residual, while accumulating the sum of
        squares of the updated residual;
     2. input <- residual * rsqrt(sum_sq / hidden_size + epsilon) * weight.
   Both passes work on _f16Vec<scalar_t, width> chunks, so only
   hidden_size / width (integer division) chunks per row are visited;
   presumably the caller guarantees hidden_size is a multiple of width
   and that the pointers are 16-byte aligned -- TODO confirm at the
   launch site. */
 template<typename scalar_t, int width>
 __global__ std::enable_if_t<
  (width > 0) && _typeConvert<scalar_t>::exists> fused_add_rms_norm_kernel(
  scalar_t* __restrict__ input,           // [..., hidden_size]
  scalar_t* __restrict__ residual,        // [..., hidden_size]
  const scalar_t* __restrict__ weight,    // [hidden_size]
  const float epsilon,
  const int num_tokens,
  const int hidden_size) {
  // Sanity checks on our vector struct and type-punned pointer arithmetic
  static_assert(std::is_pod_v<_f16Vec<scalar_t, width>>);
  static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
  // number of width-element chunks per row
  const int vec_hidden_size = hidden_size / width;
  // normalization factor, written by thread 0 and read by the whole block
  __shared__ float s_variance;
  // per-thread partial sum of squares
  float variance = 0.0f;
  /* These and the argument pointers are all declared `restrict` as they are
     not aliased in practice. Argument pointers should not be dereferenced
     in this kernel as that would be undefined behavior */
  auto* __restrict__ input_v = reinterpret_cast<_f16Vec<scalar_t, width>*>(input);
  auto* __restrict__ residual_v = reinterpret_cast<_f16Vec<scalar_t, width>*>(residual);
  auto* __restrict__ weight_v = reinterpret_cast<const _f16Vec<scalar_t, width>*>(weight);
  // Pass 1: fused residual add plus sum-of-squares accumulation.
  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
    // chunk index within the whole tensor: row blockIdx.x, chunk idx
    int id = blockIdx.x * vec_hidden_size + idx;
    _f16Vec<scalar_t, width> temp = input_v[id];
    temp += residual_v[id];
    variance += temp.sum_squares();
    // the updated residual is stored back and re-read in pass 2
    residual_v[id] = temp;
  }
  /* Keep the following if-else block in sync with the
     calculation of max_block_size in fused_add_rms_norm */
  if (num_tokens < 256) {
    variance = blockReduceSum<float, 1024>(variance);
  } else variance = blockReduceSum<float, 256>(variance);
  // thread 0 publishes the reciprocal RMS through shared memory
  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
  }
  // make s_variance visible to every thread of the block before pass 2
  __syncthreads();
  // Pass 2: scale the updated residual and apply the weight.
  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
    int id = blockIdx.x * vec_hidden_size + idx;
    _f16Vec<scalar_t, width> temp = residual_v[id];
    temp *= s_variance;
    // weight is indexed by the in-row chunk index, not the global one
    temp *= weight_v[idx];
    input_v[id] = temp;
  }
 }
 /* Generic fused_add_rms_norm_kernel
   The width field is not used here but necessary for other specializations.
 */
 template<typename scalar_t, int width>
 __global__ std::enable_if_t<
  (width == 0) || !_typeConvert<scalar_t>::exists> fused_add_rms_norm_kernel(
  scalar_t* __restrict__ input,           // [..., hidden_size]
  scalar_t* __restrict__ residual,        // [..., hidden_size]
  const scalar_t* __restrict__ weight,    // [hidden_size]
@@ -48,12 +250,17 @@ __global__ void fused_add_rms_norm_kernel(
  float variance = 0.0f;
  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float) input[blockIdx.x * hidden_size + idx];
+    scalar_t z = input[blockIdx.x * hidden_size + idx];
-    x += (float) residual[blockIdx.x * hidden_size + idx];
+    z += residual[blockIdx.x * hidden_size + idx];
    float x = (float) z;
    variance += x * x;
-    residual[blockIdx.x * hidden_size + idx] = (scalar_t) x;
+    residual[blockIdx.x * hidden_size + idx] = z;
  }
-  variance = blockReduceSum<float>(variance);
+  /* Keep the following if-else block in sync with the
     calculation of max_block_size in fused_add_rms_norm */ 
  if (num_tokens < 256) {
    variance = blockReduceSum<float, 1024>(variance);
  } else variance = blockReduceSum<float, 256>(variance);
  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
  }
@@ -93,6 +300,21 @@ void rms_norm(
    });
 }
 /* Dispatches on input.scalar_type() and launches
    vllm::fused_add_rms_norm_kernel<scalar_t, width> with the launch
    configuration <<<grid, block, 0, stream>>>.
    The macro is expanded inside fused_add_rms_norm and relies on the names
    input, residual, weight, epsilon, num_tokens, hidden_size, grid, block
    and stream being in scope at the expansion site.
    `width` selects the kernel specialization (the caller passes 8 for the
    packed/vectorized path and 0 for the generic one). */
 #define LAUNCH_FUSED_ADD_RMS_NORM(width)              \
  VLLM_DISPATCH_FLOATING_TYPES(                       \
    input.scalar_type(),                              \
    "fused_add_rms_norm_kernel",                      \
    [&] {                                             \
      vllm::fused_add_rms_norm_kernel                 \
      <scalar_t, width><<<grid, block, 0, stream>>>(  \
        input.data_ptr<scalar_t>(),                   \
        residual.data_ptr<scalar_t>(),                \
        weight.data_ptr<scalar_t>(),                  \
        epsilon,                                      \
        num_tokens,                                   \
        hidden_size);                                 \
    });
 void fused_add_rms_norm(
  torch::Tensor& input,    // [..., hidden_size]
  torch::Tensor& residual, // [..., hidden_size]
@@ -102,19 +324,29 @@ void fused_add_rms_norm(
  int num_tokens = input.numel() / hidden_size;
  dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
+  /* This kernel is memory-latency bound in many scenarios.
     When num_tokens is large, a smaller block size allows
     for increased block occupancy on CUs and better latency
     hiding on global mem ops. */
  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
  dim3 block(std::min(hidden_size, max_block_size));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(
+  /*If the tensor types are FP16/BF16, try to use the optimized kernel
-    input.scalar_type(),
+    with packed + vectorized ops.
-    "fused_add_rms_norm_kernel",
+    Max optimization is achieved with a width-8 vector of FP16/BF16s
-    [&] {
+    since we can load at most 128 bits at once in a global memory op.
-      vllm::fused_add_rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
+    However, this requires each tensor's data to be aligned to 16
-        input.data_ptr<scalar_t>(),
+    bytes.
-        residual.data_ptr<scalar_t>(),
+   */
-        weight.data_ptr<scalar_t>(),
+  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
-        epsilon,
+  auto res_ptr = reinterpret_cast<std::uintptr_t>(residual.data_ptr());
-        num_tokens,
+  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
-        hidden_size);
+  bool ptrs_are_aligned = inp_ptr % 16 == 0 && res_ptr % 16 == 0 \
-    });
+                          && wt_ptr % 16 == 0;
  if (ptrs_are_aligned && hidden_size % 8 == 0) {
    LAUNCH_FUSED_ADD_RMS_NORM(8);
  } else {
    LAUNCH_FUSED_ADD_RMS_NORM(0);
  }
 }
--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe_align_block_size_kernels.cu
@@ -7,10 +7,17 @@
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
 const static size_t NUM_MAX_EXPERTS = 64;
 #define CEILDIV(x,y) (((x) + (y) - 1) / (y))
 namespace vllm {
 namespace {
 // Flattens a (row, col) coordinate into an offset of a row-major 2D buffer
 // that has `total_col` columns.
 __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, int32_t col) {
    // int32 arithmetic is sufficient: num_experts is relatively small, so
    // overflow is not a concern here.
    const int32_t row_start = row * total_col;
    return row_start + col;
 }
 }  // namespace
 template <typename scalar_t>
 __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, 
                                int32_t *sorted_token_ids, 
@@ -21,10 +28,14 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
                                size_t numel) {
    const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
    const size_t start_idx = threadIdx.x * tokens_per_thread;
-    __shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS];
+
-    __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1];
+    extern __shared__ int32_t shared_mem[];
    int32_t* tokens_cnts = shared_mem; // 2d tensor with shape (num_experts + 1, num_experts)
    int32_t* cumsum = shared_mem + (num_experts + 1) * num_experts; // 1d tensor with shape (num_experts + 1)
    for (int i = 0; i < num_experts; ++i) {
-        tokens_cnts[threadIdx.x + 1][i] = 0;
+        tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
    }
    /**
@@ -33,15 +44,15 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
    * to expert expert_index.
    */
    for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-        ++tokens_cnts[threadIdx.x + 1][topk_ids[i]]; 
+        ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; 
    }
    __syncthreads();
    // For each expert we accumulate the token counts from the different threads.
-    tokens_cnts[0][threadIdx.x] = 0;
+    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
    for (int i = 1; i <= blockDim.x; ++i) {
-        tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x];
+        tokens_cnts[index(num_experts, i, threadIdx.x)] += tokens_cnts[index(num_experts, i-1, threadIdx.x)];
    }
    __syncthreads();
@@ -50,7 +61,7 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
    if (threadIdx.x == 0) {
        cumsum[0] = 0;
        for (int i = 1; i <= num_experts; ++i) {
-            cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[blockDim.x][i - 1], block_size) * block_size;
+            cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], block_size) * block_size;
        }
        *total_tokens_post_pad = cumsum[num_experts];
    }
@@ -78,9 +89,9 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
        * stores the indices of the tokens processed by the expert with expert_id within
        * the current thread's token shard.
        */
-        int32_t rank_post_pad = tokens_cnts[threadIdx.x][expert_id] + cumsum[expert_id];
+        int32_t rank_post_pad = tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id];
        sorted_token_ids[rank_post_pad] = i;
-        ++tokens_cnts[threadIdx.x][expert_id];
+        ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
    }
 }
 }
@@ -93,11 +104,17 @@ void moe_align_block_size(
    torch::Tensor experts_ids,
    torch::Tensor num_tokens_post_pad) {
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    assert(num_experts <= NUM_MAX_EXPERTS);
    VLLM_DISPATCH_INTEGRAL_TYPES(
        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-        vllm::moe_align_block_size_kernel<scalar_t><<<1, num_experts, 0, stream>>>(
+        // calc needed amount of shared mem for `tokens_cnts` and `cumsum` tensors
-            topk_ids.data_ptr<scalar_t>(), 
+        const int32_t shared_mem = ((num_experts + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
        // set dynamic shared mem
        auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
        AT_CUDA_CHECK(
            VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize((void *)kernel, shared_mem));
        kernel<<<1, num_experts, shared_mem, stream>>>(
            topk_ids.data_ptr<scalar_t>(),
            sorted_token_ids.data_ptr<int32_t>(), 
            experts_ids.data_ptr<int32_t>(), 
            num_tokens_post_pad.data_ptr<int32_t>(), 
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -10,11 +10,12 @@ void paged_attention_v1(
  int num_kv_heads,
  float scale,
  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
+  torch::Tensor& seq_lens,
  int block_size,
-  int max_context_len,
+  int max_seq_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
-  const std::string& kv_cache_dtype);
+  const std::string& kv_cache_dtype,
  float kv_scale);
 void paged_attention_v2(
  torch::Tensor& out,
@@ -27,11 +28,12 @@ void paged_attention_v2(
  int num_kv_heads,
  float scale,
  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
+  torch::Tensor& seq_lens,
  int block_size,
-  int max_context_len,
+  int max_seq_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
-  const std::string& kv_cache_dtype);
+  const std::string& kv_cache_dtype,
  float kv_scale);
 void rms_norm(
  torch::Tensor& out,
@@ -53,6 +55,16 @@ void rotary_embedding(
  torch::Tensor& cos_sin_cache,
  bool is_neox);
 // Batched variant of rotary_embedding: each token additionally carries an
 // offset (cos_sin_cache_offsets) that is added to its position when the
 // cos/sin cache row is looked up. Defined in csrc/pos_encoding_kernels.cu;
 // the shapes below mirror the comments on that definition.
 void batched_rotary_embedding(
  torch::Tensor& positions,               // [batch_size, seq_len] or [num_tokens]
  torch::Tensor& query,                   // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size]
  torch::Tensor& key,                     // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size]
  int head_size,
  torch::Tensor& cos_sin_cache,           // [max_position, rot_dim]
  bool is_neox,
  int rot_dim,
  torch::Tensor& cos_sin_cache_offsets);  // [num_tokens]
 void silu_and_mul(
  torch::Tensor& out,
  torch::Tensor& input);
@@ -61,6 +73,10 @@ void gelu_and_mul(
  torch::Tensor& out,
  torch::Tensor& input);
 void gelu_tanh_and_mul(
  torch::Tensor& out,
  torch::Tensor& input);
 void gelu_new(
  torch::Tensor& out,
  torch::Tensor& input);
@@ -70,6 +86,21 @@ void gelu_fast(
  torch::Tensor& input);
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(
  const torch::Tensor& input,
  const torch::Tensor& codes,
  const torch::Tensor& codebooks,
  const torch::Tensor& scales,
  const torch::Tensor& codebook_partition_sizes,
  const std::optional<torch::Tensor>& bias
 );
 torch::Tensor aqlm_dequant(
  const torch::Tensor& codes,
  const torch::Tensor& codebooks,
  const torch::Tensor& codebook_partition_sizes
 );
 torch::Tensor awq_gemm(
  torch::Tensor _in_feats,
  torch::Tensor _kernel,
@@ -93,6 +124,26 @@ torch::Tensor marlin_gemm(
    int64_t size_m, 
    int64_t size_n, 
    int64_t size_k);
 torch::Tensor gptq_marlin_gemm(
  torch::Tensor &a,
  torch::Tensor &b_q_weight,
  torch::Tensor &b_scales,
  torch::Tensor &g_idx,
  torch::Tensor &perm,
  torch::Tensor &workspace,
  int64_t num_bits,
  int64_t size_m,
  int64_t size_n,
  int64_t size_k,
  bool is_k_full);
 torch::Tensor gptq_marlin_repack(
  torch::Tensor &b_q_weight,
  torch::Tensor &perm,
  int64_t size_k,
  int64_t size_n,
  int64_t num_bits);
 #endif
 void squeezellm_gemm(
@@ -115,6 +166,16 @@ void gptq_shuffle(
  torch::Tensor q_perm,
  int bit);
 void static_scaled_fp8_quant(
  torch::Tensor& out,
  torch::Tensor& input,
  torch::Tensor& scale);
 void dynamic_scaled_fp8_quant(
  torch::Tensor& out,
  torch::Tensor& input,
  torch::Tensor& scale);
 void moe_align_block_size(
  torch::Tensor topk_ids,
  int num_experts,
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@@ -8,7 +8,7 @@
 namespace vllm {
 template<typename scalar_t, bool IS_NEOX>
-inline __device__ void apply_rotary_embedding(
+inline __device__ void apply_token_rotary_embedding(
  scalar_t* __restrict__ arr,
  const scalar_t* __restrict__ cos_ptr,
  const scalar_t* __restrict__ sin_ptr,
@@ -37,6 +37,42 @@ inline __device__ void apply_rotary_embedding(
  arr[y_index] = y * cos + x * sin;
 }
 // Rotates, in place, every query head and every key head of the token
 // `token_idx`, using the cos/sin row that `cache_ptr` points at.
 // The (head, rotation offset) pairs are distributed over the threads of the
 // calling block: thread t handles items t, t + blockDim.x, ...
 template<typename scalar_t, bool IS_NEOX>
 inline __device__ void apply_rotary_embedding(
  scalar_t* __restrict__ query,                 // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
  scalar_t* __restrict__ key,                   // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
  const scalar_t* cache_ptr,                    // rot_dim / 2 cos values followed by rot_dim / 2 sin values
  const int head_size,
  const int num_heads,
  const int num_kv_heads,
  const int rot_dim,
  const int token_idx,
  const int64_t query_stride,
  const int64_t key_stride)
 {
  // The first half of the cache row holds cos, the second half holds sin.
  const int half_rot = rot_dim / 2;
  const scalar_t* cos_row = cache_ptr;
  const scalar_t* sin_row = cache_ptr + half_rot;

  // Query heads: one work item per (head, rotation offset) pair.
  const int64_t query_base = token_idx * query_stride;
  const int query_items = num_heads * half_rot;
  for (int item = threadIdx.x; item < query_items; item += blockDim.x) {
    const int head = item / half_rot;
    const int offset = item % half_rot;
    const int64_t head_start = query_base + head * head_size;
    apply_token_rotary_embedding<scalar_t, IS_NEOX>(
        query + head_start, cos_row, sin_row, offset, half_rot);
  }

  // Key heads: same scheme, using the key stride and the KV head count.
  const int64_t key_base = token_idx * key_stride;
  const int key_items = num_kv_heads * half_rot;
  for (int item = threadIdx.x; item < key_items; item += blockDim.x) {
    const int head = item / half_rot;
    const int offset = item % half_rot;
    const int64_t head_start = key_base + head * head_size;
    apply_token_rotary_embedding<scalar_t, IS_NEOX>(
        key + head_start, cos_row, sin_row, offset, half_rot);
  }
 }
 template<typename scalar_t, bool IS_NEOX>
 __global__ void rotary_embedding_kernel(
  const int64_t* __restrict__ positions,        // [batch_size, seq_len] or [num_tokens]
@@ -54,27 +90,29 @@ __global__ void rotary_embedding_kernel(
  int64_t pos = positions[token_idx];
  const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
-  const int embed_dim = rot_dim / 2;
+  apply_rotary_embedding<scalar_t, IS_NEOX>(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride);
-  const scalar_t* cos_ptr = cache_ptr;
+}
  const scalar_t* sin_ptr = cache_ptr + embed_dim;
-  const int nq = num_heads * embed_dim;
+template<typename scalar_t, bool IS_NEOX>
-  for (int i = threadIdx.x; i < nq; i += blockDim.x) {
+__global__ void batched_rotary_embedding_kernel(
-    const int head_idx = i / embed_dim;
+  const int64_t* __restrict__ positions,              // [batch_size, seq_len] or [num_tokens]
-    const int64_t token_head = token_idx * query_stride + head_idx * head_size;
+  scalar_t* __restrict__ query,                       // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
-    const int rot_offset = i % embed_dim;
+  scalar_t* __restrict__ key,                         // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
-    apply_rotary_embedding<scalar_t, IS_NEOX>(query + token_head, cos_ptr,
+  const scalar_t* __restrict__ cos_sin_cache,         // [max_position, 2, rot_dim // 2]
-                                              sin_ptr, rot_offset, embed_dim);
+  const int64_t* __restrict__ cos_sin_cache_offsets,  // [batch_size, seq_len] or [num_tokens]
-  }
+  const int rot_dim,
  const int64_t query_stride,
  const int64_t key_stride,
  const int num_heads,
  const int num_kv_heads,
  const int head_size) {
  // Each thread block is responsible for one token.
  const int token_idx = blockIdx.x;
  int64_t pos = positions[token_idx];
  int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx];
  const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim;
-  const int nk = num_kv_heads * embed_dim;
+  apply_rotary_embedding<scalar_t, IS_NEOX>(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride);
  for (int i = threadIdx.x; i < nk; i += blockDim.x) {
    const int head_idx = i / embed_dim;
    const int64_t token_head = token_idx * key_stride + head_idx * head_size;
    const int rot_offset = i % embed_dim;
    apply_rotary_embedding<scalar_t, IS_NEOX>(key + token_head, cos_ptr,
                                              sin_ptr, rot_offset, embed_dim);
  }
 }
 } // namespace vllm
@@ -128,3 +166,61 @@ void rotary_embedding(
      }
    });
 }
 /*
 Batched rotary embedding: multiple LoRAs are packed together and processed
 in one batched launch. Each token carries its own offset into cos_sin_cache
 (cos_sin_cache_offsets), which the kernel adds to the token position when
 selecting the cos/sin row. One thread block is launched per token.
 */
 void batched_rotary_embedding(
  torch::Tensor& positions,         // [batch_size, seq_len] or [num_tokens]
  torch::Tensor& query,             // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size]
  torch::Tensor& key,               // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size]
  int head_size,
  torch::Tensor& cos_sin_cache,     // [max_position, rot_dim]
  bool is_neox,
  int rot_dim,
  torch::Tensor& cos_sin_cache_offsets // [num_tokens]
 ) {
  // Head counts and per-token strides are derived from the tensor layouts.
  const int num_heads = query.size(-1) / head_size;
  const int num_kv_heads = key.size(-1) / head_size;
  const int64_t query_stride = query.stride(-2);
  const int64_t key_stride = key.stride(-2);

  // Launch geometry: the grid has one block per entry of
  // cos_sin_cache_offsets; the block size is capped at 512 threads.
  const int64_t num_tokens = cos_sin_cache_offsets.size(0);
  const dim3 grid(num_tokens);
  const dim3 block(std::min(num_heads * rot_dim / 2, 512));

  // The device guard must be in place before the current stream is queried.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  VLLM_DISPATCH_FLOATING_TYPES(
    query.scalar_type(),
    "rotary_embedding",
    [&] {
      const int64_t* positions_ptr = positions.data_ptr<int64_t>();
      scalar_t* query_ptr = query.data_ptr<scalar_t>();
      scalar_t* key_ptr = key.data_ptr<scalar_t>();
      const scalar_t* cache_ptr = cos_sin_cache.data_ptr<scalar_t>();
      const int64_t* offsets_ptr = cos_sin_cache_offsets.data_ptr<int64_t>();

      // IS_NEOX is a compile-time template argument, so the runtime flag
      // picks between the two instantiations.
      if (is_neox) {
        vllm::batched_rotary_embedding_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
          positions_ptr, query_ptr, key_ptr, cache_ptr, offsets_ptr,
          rot_dim, query_stride, key_stride,
          num_heads, num_kv_heads, head_size);
      } else {
        vllm::batched_rotary_embedding_kernel<scalar_t, false><<<grid, block, 0, stream>>>(
          positions_ptr, query_ptr, key_ptr, cache_ptr, offsets_ptr,
          rot_dim, query_stride, key_stride,
          num_heads, num_kv_heads, head_size);
      }
    });
 }
--- a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu
@@ -2,3 +2,4 @@
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
 FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half)
--- a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half)
--- a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu
@@ -2,3 +2,4 @@
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16)
 FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, float, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half)
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -14,21 +14,29 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 128) \
    f(in_T, out_T, W_T, narrow, 256) \
    f(in_T, out_T, W_T, narrow, 512) \
    f(in_T, out_T, W_T, narrow, 640) \
    f(in_T, out_T, W_T, narrow, 768) \
    f(in_T, out_T, W_T, narrow, 1024) \
    f(in_T, out_T, W_T, narrow, 1152) \
    f(in_T, out_T, W_T, narrow, 1280) \
    f(in_T, out_T, W_T, narrow, 1536) \
    f(in_T, out_T, W_T, narrow, 1728) \
    f(in_T, out_T, W_T, narrow, 1792) \
    f(in_T, out_T, W_T, narrow, 2048) \
    f(in_T, out_T, W_T, narrow, 2304) \
    f(in_T, out_T, W_T, narrow, 2560) \
    f(in_T, out_T, W_T, narrow, 2752) \
    f(in_T, out_T, W_T, narrow, 2816) \
    f(in_T, out_T, W_T, narrow, 3072) \
    f(in_T, out_T, W_T, narrow, 3456) \
    f(in_T, out_T, W_T, narrow, 3584) \
    f(in_T, out_T, W_T, narrow, 4096) \
    f(in_T, out_T, W_T, narrow, 4608) \
    f(in_T, out_T, W_T, narrow, 5120) \
    f(in_T, out_T, W_T, narrow, 5504) \
    f(in_T, out_T, W_T, narrow, 5632) \
    f(in_T, out_T, W_T, narrow, 6144) \
    f(in_T, out_T, W_T, narrow, 6848) \
    f(in_T, out_T, W_T, narrow, 6912) \
    f(in_T, out_T, W_T, narrow, 7168) \
    f(in_T, out_T, W_T, narrow, 8192) \
@@ -36,11 +44,15 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 10240) \
    f(in_T, out_T, W_T, narrow, 11008) \
    f(in_T, out_T, W_T, narrow, 12288) \
    f(in_T, out_T, W_T, narrow, 13696) \
    f(in_T, out_T, W_T, narrow, 13824) \
    f(in_T, out_T, W_T, narrow, 14336) \
    f(in_T, out_T, W_T, narrow, 15360) \
    f(in_T, out_T, W_T, narrow, 16384) \
    f(in_T, out_T, W_T, narrow, 20480) \
    f(in_T, out_T, W_T, narrow, 22016) \
    f(in_T, out_T, W_T, narrow, 24576) \
    f(in_T, out_T, W_T, narrow, 27392) \
    f(in_T, out_T, W_T, narrow, 28672) \
    f(in_T, out_T, W_T, narrow, 32000) \
    f(in_T, out_T, W_T, narrow, 32256) \
@@ -48,9 +60,88 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 32768) \
    f(in_T, out_T, W_T, narrow, 33024) \
    f(in_T, out_T, W_T, narrow, 36864) \
    f(in_T, out_T, W_T, narrow, 43264) \
    f(in_T, out_T, W_T, narrow, 49152) \
    f(in_T, out_T, W_T, narrow, 64000) \
    f(in_T, out_T, W_T, narrow, 64256) \
    f(in_T, out_T, W_T, narrow, 64512) \
    f(in_T, out_T, W_T, narrow, 102400) \
    f(in_T, out_T, W_T, narrow, 102656) \
    f(in_T, out_T, W_T, narrow, 102912) \
    f(in_T, out_T, W_T, narrow, 128000) \
    f(in_T, out_T, W_T, narrow, 128256) \
    f(in_T, out_T, W_T, narrow, 128512) \
 // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
 // and vllm/tests/lora/test_punica.py
 // Used for defining kernels that go from a variety of input dims
 // to a narrow output dim.
 // It is used for the fully sharded column-parallel LoRA A,
 // which splits the rank dim.
 #define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
    f(in_T, out_T, W_T, 128, narrow) \
    f(in_T, out_T, W_T, 256, narrow) \
    f(in_T, out_T, W_T, 512, narrow) \
    f(in_T, out_T, W_T, 640, narrow) \
    f(in_T, out_T, W_T, 768, narrow) \
    f(in_T, out_T, W_T, 1024, narrow) \
    f(in_T, out_T, W_T, 1152, narrow) \
    f(in_T, out_T, W_T, 1280, narrow) \
    f(in_T, out_T, W_T, 1536, narrow) \
    f(in_T, out_T, W_T, 1728, narrow) \
    f(in_T, out_T, W_T, 1792, narrow) \
    f(in_T, out_T, W_T, 2048, narrow) \
    f(in_T, out_T, W_T, 2304, narrow) \
    f(in_T, out_T, W_T, 2560, narrow) \
    f(in_T, out_T, W_T, 2752, narrow) \
    f(in_T, out_T, W_T, 2816, narrow) \
    f(in_T, out_T, W_T, 3072, narrow) \
    f(in_T, out_T, W_T, 3456, narrow) \
    f(in_T, out_T, W_T, 3584, narrow) \
    f(in_T, out_T, W_T, 4096, narrow) \
    f(in_T, out_T, W_T, 4608, narrow) \
    f(in_T, out_T, W_T, 5120, narrow) \
    f(in_T, out_T, W_T, 5504, narrow) \
    f(in_T, out_T, W_T, 5632, narrow) \
    f(in_T, out_T, W_T, 6144, narrow) \
    f(in_T, out_T, W_T, 6848, narrow) \
    f(in_T, out_T, W_T, 6912, narrow) \
    f(in_T, out_T, W_T, 7168, narrow) \
    f(in_T, out_T, W_T, 8192, narrow) \
    f(in_T, out_T, W_T, 9216, narrow) \
    f(in_T, out_T, W_T, 10240, narrow) \
    f(in_T, out_T, W_T, 11008, narrow) \
    f(in_T, out_T, W_T, 12288, narrow) \
    f(in_T, out_T, W_T, 13696, narrow) \
    f(in_T, out_T, W_T, 13824, narrow) \
    f(in_T, out_T, W_T, 14336, narrow) \
    f(in_T, out_T, W_T, 15360, narrow) \
    f(in_T, out_T, W_T, 16384, narrow) \
    f(in_T, out_T, W_T, 20480, narrow) \
    f(in_T, out_T, W_T, 22016, narrow) \
    f(in_T, out_T, W_T, 24576, narrow) \
    f(in_T, out_T, W_T, 27392, narrow) \
    f(in_T, out_T, W_T, 28672, narrow) \
    f(in_T, out_T, W_T, 32000, narrow) \
    f(in_T, out_T, W_T, 32256, narrow) \
    f(in_T, out_T, W_T, 32512, narrow) \
    f(in_T, out_T, W_T, 32768, narrow) \
    f(in_T, out_T, W_T, 33024, narrow) \
    f(in_T, out_T, W_T, 36864, narrow) \
    f(in_T, out_T, W_T, 43264, narrow) \
    f(in_T, out_T, W_T, 49152, narrow) \
    f(in_T, out_T, W_T, 64000, narrow) \
    f(in_T, out_T, W_T, 64256, narrow) \
    f(in_T, out_T, W_T, 64512, narrow) \
    f(in_T, out_T, W_T, 102400, narrow) \
    f(in_T, out_T, W_T, 102656, narrow) \
    f(in_T, out_T, W_T, 102912, narrow) \
    f(in_T, out_T, W_T, 128000, narrow) \
    f(in_T, out_T, W_T, 128256, narrow) \
    f(in_T, out_T, W_T, 128512, narrow) \
 // Keep above in sync with vllm/lora/layers::SamplerWithLoRA
 // Keep this in sync with vllm/config::LoRAConfig
 #define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
    FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8)  \
@@ -58,4 +149,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \
    FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64)
 // Applies f to the one-sided (feat_in, feat_out) pairs: every width listed in
 // FOR_INST_BGMV_NARROW as feat_in with feat_out = 1, 2 and 4, plus the pairs
 // (8, 64), (16, 64), (32, 64) and (64, 64).
 #define FOR_INST_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
    FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 1) \
    FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 2) \
    FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 4) \
    f(in_T, out_T, W_T, 8, 64) \
    f(in_T, out_T, W_T, 16, 64) \
    f(in_T, out_T, W_T, 32, 64) \
    f(in_T, out_T, W_T, 64, 64)
 // clang-format on
--- a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu
@@ -2,3 +2,4 @@
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half)
 FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu
@@ -2,3 +2,4 @@
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half)
 FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu
@@ -2,3 +2,4 @@
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16)
 FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_bfloat16, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu
@@ -2,3 +2,4 @@
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half)
 FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
@@ -1,4 +0,0 @@
 #include "bgmv_config.h"
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half)
--- a/csrc/punica/bgmv/bgmv_impl.cuh
+++ b/csrc/punica/bgmv/bgmv_impl.cuh
@@ -199,7 +199,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
  constexpr int tz = 4;
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  if constexpr (feat_in < feat_out) {
+  if constexpr (feat_in <= feat_out) {
    static_assert(feat_in % vec_size == 0);
    constexpr int tx = feat_in / vec_size;
@@ -289,6 +289,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
      int64_t y_offset, int64_t full_y_size, int64_t batch_size,               \
      int64_t num_layers, int64_t layer_idx, float scale);
 // Expands to a single INST_BGMV for exactly the (feat_in, feat_out) pair
 // given; the reversed pair is not emitted.
 #define INST_BGMV_ONESIDE(in_T, out_T, W_T, feat_in, feat_out)                 \
  INST_BGMV(feat_in, feat_out, in_T, out_T, W_T)
 // Expands to INST_BGMV for both directions of the pair:
 // (narrow -> wide) and (wide -> narrow).
 #define INST_BGMV_TWOSIDE(in_T, out_T, W_T, narrow, wide)                      \
  INST_BGMV(narrow, wide, in_T, out_T, W_T)                                    \
  INST_BGMV(wide, narrow, in_T, out_T, W_T)
--- a/csrc/punica/bgmv/generator.py
+++ b/csrc/punica/bgmv/generator.py
@@ -10,7 +10,8 @@ TEMPLATE = """
 #include "bgmv_impl.cuh"
 FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype})
-""".lstrip()
+FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype})
 """.lstrip()  # noqa: E501
 for input_dtype in DTYPES:
    for output_dtype in DTYPES:
@@ -18,6 +19,26 @@ for input_dtype in DTYPES:
            if weight_dtype == "fp32":
                # FP32 weights are not supported.
                continue
            if output_dtype == "fp32":
                # LoRA A matrix.
                if input_dtype != weight_dtype:
                    # NOTE(woosuk): While Punica supports the case where the
                    # input and weight dtypes are different, we only generate
                    # the kernels with the same dtypes to reduce the binary size.
                    continue
            elif input_dtype == "fp32":
                # LoRA B matrix.
                if output_dtype != weight_dtype:
                    # NOTE(woosuk): While Punica supports the case where the
                    # output and weight dtypes are different, we only generate
                    # the kernels with the same dtypes to reduce the binary size.
                    continue
            elif not (input_dtype == output_dtype == weight_dtype):
                # NOTE(woosuk): While Punica supports mixed data types for
                # input, output, and weight, we only generate the kernels with
                # the same data types to reduce the binary size.
                continue
            kernel_definition = TEMPLATE.format(
                input_dtype=DTYPE_MAP[input_dtype],
                output_dtype=DTYPE_MAP[output_dtype],
--- a/csrc/punica/punica_ops.cc
+++ b/csrc/punica/punica_ops.cc
@@ -1,7 +1,7 @@
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <torch/extension.h>
-
+#include <c10/cuda/CUDAGuard.h>
 #include <cstdint>
 #include "bgmv/bgmv_config.h"
@@ -20,8 +20,8 @@ inline void check_shape(const torch::Tensor &a, const torch::Tensor &b,
  }
 }
-inline constexpr uint32_t pack_u16(uint16_t a, uint16_t b) {
+inline constexpr uint64_t pack_u32(uint32_t a, uint32_t b) {
-  return (uint32_t(a) << 16) | uint32_t(b);
+  return (uint64_t(a) << 32) | uint64_t(b);
 }
 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
@@ -46,13 +46,30 @@ inline constexpr uint32_t pack_u16(uint16_t a, uint16_t b) {
 template <typename in_T, typename out_T, typename W_T>
 inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
                               const int64_t *lora_indices,
-                               uint16_t in_features, uint16_t out_features,
+                               uint32_t in_features, uint32_t out_features,
                               int64_t y_offset, int64_t full_y_size,
                               int64_t batch_size, int64_t num_layers,
                               int64_t layer_idx, float scale) {
-  switch (pack_u16(in_features, out_features)) {
+  // NOTE(woosuk): While Punica supports various combinations of input/output
  // data types, we limit the supported data types to reduce the binary size.
  constexpr bool is_input_float = std::is_same<in_T, float>::value;
  constexpr bool is_output_float = std::is_same<out_T, float>::value;
  if (is_input_float) {
    if (!std::is_same<out_T, W_T>::value) {
      return false;
    }
  } else if (is_output_float) {
    if (!std::is_same<in_T, W_T>::value) {
      return false;
    }
  } else if (!(std::is_same<in_T, W_T>::value &&
               std::is_same<out_T, W_T>::value)) {
    return false;
  }
  switch (pack_u32(in_features, out_features)) {
 #define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out)                   \
-  case pack_u16(feat_in, feat_out):                                            \
+  case pack_u32(feat_in, feat_out):                                            \
    bgmv_kernel<feat_in, feat_out>(Y, X, W, lora_indices, y_offset,            \
                                   full_y_size, batch_size, num_layers,        \
                                   layer_idx, scale);                          \
@@ -62,12 +79,12 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
  CASE_ONESIDE(in_T, out_T, W_T, wide, narrow)
    FOR_BGMV_WIDE_NARROW(CASE, _, _, _)
    FOR_INST_BGMV_WIDE_NARROW(CASE_ONESIDE, _, _, _)
 #undef CASE
 #undef CASE_ONESIDE
  default:
    return false;
  }
  return true;
 }
@@ -91,8 +108,9 @@ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
  CHECK_EQ(w.size(2), h_out);
  CHECK_EQ(indicies.size(0), x.size(0));
  CHECK_EQ(y.size(0), x.size(0));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
  bool ok = false;
-  if (h_in < 65536 && h_out < 65536) {
+  if (h_in <= 128512 && h_out <= 128512) {
    // TODO: See if we can get rid of this massive nested switch
    switch (x.scalar_type()) {
    case at::ScalarType::Half:
@@ -322,8 +340,9 @@ void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
  CHECK_EQ(w.size(2), h_out);
  CHECK_EQ(indicies.size(0), x.size(0));
  CHECK_EQ(y.size(0), x.size(0));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
  bool ok = false;
-  if (h_in < 65536 && h_out < 65536) {
+  if (h_in <= 128512 && h_out <= 128512) {
    // TODO: See if we can get rid of this massive nested switch
    switch (x.scalar_type()) {
    case at::ScalarType::Half:
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -25,7 +25,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  ops.def(
    "gelu_and_mul",
    &gelu_and_mul,
-    "Activation function used in GeGLU.");
+    "Activation function used in GeGLU with `none` approximation.");
  ops.def(
    "gelu_tanh_and_mul",
    &gelu_tanh_and_mul,
    "Activation function used in GeGLU with `tanh` approximation.");
  ops.def(
    "gelu_new",
    &gelu_new,
@@ -52,16 +56,27 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    &rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
  ops.def(
    "batched_rotary_embedding",
    &batched_rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple loras)");
 // Quantization ops
 #ifndef USE_ROCM
  ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
  ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm, "gptq_marlin Optimized Quantized GEMM for GPTQ");
  ops.def("gptq_marlin_repack", &gptq_marlin_repack, "gptq_marlin repack from GPTQ");
  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
 #endif
  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
  ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor");
  ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
  ops.def(
    "moe_align_block_size",
    &moe_align_block_size,
@@ -82,9 +97,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    &reshape_and_cache,
    "Reshape the key and value tensors and cache them");
  cache_ops.def(
-    "convert_fp8_e5m2",
+    "reshape_and_cache_flash",
-    &convert_fp8_e5m2,
+    &reshape_and_cache_flash,
-    "Convert the key and value cache to fp8_e5m2 data type");
+    "Reshape the key and value tensors and cache them");
  cache_ops.def(
    "convert_fp8",
    &convert_fp8,
    "Convert the key and value cache to fp8 data type");
  // Cuda utils
  pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
--- a/csrc/quantization/aqlm/gemm_kernels.cu
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@@ -0,0 +1,712 @@
 /*
 * Modified by Neural Magic
 * Adapted from https://github.com/Vahe1994/AQLM
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <torch/extension.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <iostream>
 #include <cstdlib>
 namespace vllm {
 namespace aqlm {
 // Matrix-vector kernel for the "1x16" code layout: every 16-bit code in A
 // selects one int4 (8 halves) codebook entry, which is multiplied with the
 // matching 8 halves of B. Each warp produces one half-precision element of C.
 //
 //   A        packed codes, read as uint16_t (8 codes per int4).
 //   B        input vector, read as half2 through shared memory.
 //   C        output vector, written as __half.
 //   codebook codebook entries, one int4 per entry.
 //   prob_m   number of output rows; warps with a row index >= prob_m only
 //            help staging B.
 //   prob_k   input length in halves (the code uses prob_k / 8 int4 of B).
 __global__ void Code1x16MatVec(
  const int4* __restrict__ A,
  const int4* __restrict__ B,
        int4* __restrict__ C,
  const int4* __restrict__ codebook,
  const int prob_m,
  const int prob_k,
  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long.
  const int codebook_stride // as int4.
 ) {
  // Number of int4 of A per output row (8 codes per int4, 8 halves per code).
  int a_gl_stride = prob_k / 8 / 8;
  // One warp per output row.
  int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
  bool pred = a_gl_rd < prob_m;

  if (pred)
  {
    // Advance to the correct codebook; this is easy because we only multiply one column of the codebook.
    // Walks the x, y, z, w members of codebook_a_sizes until the row falls
    // below the cumulative size.
    auto codebook_size = &codebook_a_sizes.x;
    while (a_gl_rd >= *codebook_size)
    {
        codebook += codebook_stride;
        ++codebook_size;
    }
  }

  int b_gl_rd = 0;
  // Remember the row index before a_gl_rd is turned into an int4 offset.
  int c_gl_wr = a_gl_rd;
  a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
  int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;

  // Tile of B: 32 groups of 8 int4, each group padded to 9 int4.
  __shared__ int4 sh_b[32 * 9];
  float res = 0;

  // Number of 32 * 8 int4 tiles needed to cover B (rounded up).
  int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
  while (iters--) {
    // We pad shared memory to avoid bank conflicts during reads
    __syncthreads();
    for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
      if (b_gl_rd + i < prob_k / 8)
        sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
    }
    __syncthreads();
    b_gl_rd += 32 * 8;

    // Each lane consumes its own padded group of 8 int4 from the tile.
    int b_sh_rd = 9 * (threadIdx.x % 32);
    if (pred && a_gl_rd < a_gl_end) {
      // One int4 of A holds the 8 codes this lane handles in this tile.
      const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
      #pragma unroll
      for (int i = 0; i < 8; i++) {
        uint32_t dec[4];
        // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
        // actually help us; this brings > 2x speedup.
        asm volatile (
          "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
          : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
          : "l"((void*) &codebook[enc[i]])
        );
        half2* a = reinterpret_cast<half2*>(&dec);
        half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
        // Accumulate the 8-element dot product in half2, then widen to float.
        half2 res2 = {};
        #pragma unroll
        for (int j = 0; j < 4; j++)
          res2 = __hfma2(a[j], b[j], res2);
        res += __half2float(res2.x) + __half2float(res2.y);
        b_sh_rd++;
      }
      a_gl_rd += 32;
    }
  }

  if (pred) {
    // Sum the per-lane partial results; lane 0 ends up with the row total.
    #pragma unroll
    for (int i = 16; i > 0; i /= 2)
      res += __shfl_down_sync(0xffffffff, res, i);
    if (threadIdx.x % 32 == 0)
      reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
  }
 }
 // Matrix-vector kernel for the "2x8" code layout: every pair of 8-bit codes in
 // A selects one entry from each of two 256-entry codebooks; the two entries are
 // added and multiplied with the matching 8 halves of B. Each warp produces one
 // half-precision element of C.
 //
 // Requires dynamic shared memory for the B tile (32 * 9 int4) followed by the
 // replicated codebooks (2 * 256 * 8 int4).
 __global__ void Code2x8MatVec(
  const int4* __restrict__ A,
  const int4* __restrict__ B,
        int4* __restrict__ C,
  const int4* __restrict__ codebook,
  int prob_m,
  int prob_k,
  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long.
  const int codebook_stride // as int4.
 ) {
  // Number of int4 of A per output row.
  int a_gl_stride = prob_k / 8 / 8;
  // One warp per output row.
  int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
  bool pred = a_gl_rd < prob_m;

  if (pred)
  {
    // Advance to the correct codebook; this is easy because we only multiply one column of the codebook.
    auto codebook_size = &codebook_a_sizes.x;
    while (a_gl_rd >= *codebook_size)
    {
        codebook += codebook_stride;
        ++codebook_size;
    }
  }

  int b_gl_rd = 0;
  // Remember the row index before a_gl_rd is turned into an int4 offset.
  int c_gl_wr = a_gl_rd;
  a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
  int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
  // Selects which of the 8 replicated copies of a codebook entry this thread reads.
  int lane = threadIdx.x % 8;

  // Dynamic shared memory layout: [B tile | codebook 0 | codebook 1].
  extern __shared__ int4 sh[];
  int4* sh_b = sh;
  int4* sh_code = sh_b + 32 * 9;
  int4* sh_code0 = sh_code;
  int4* sh_code1 = sh_code + 256 * 8;

  // Copy both 256-entry codebooks into shared memory, writing each entry into
  // all 8 slots of its group (presumably to spread reads across banks).
  for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
    int4 dec = codebook[i];
    #pragma unroll
    for (int j = 0; j < 8; j++)
      sh_code[8 * i + (j + lane) % 8] = dec;
  }
  __syncthreads();

  float res = 0;

  // Number of 32 * 8 int4 tiles needed to cover B (rounded up).
  int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
  while (iters--) {
    // We pad shared memory to avoid bank conflicts during reads
    __syncthreads();
    for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
      if (b_gl_rd + i < prob_k / 8)
        sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
    }
    __syncthreads();
    b_gl_rd += 32 * 8;

    int b_sh_rd = 9 * (threadIdx.x % 32);
    if (pred && a_gl_rd < a_gl_end) {
      // One int4 of A holds 16 bytes: 8 pairs of (codebook 0, codebook 1) codes.
      const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
      #pragma unroll
      for (int i = 0; i < 8; i++) {
        half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
        half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
        half2*  b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
        // The decoded weight is the sum of the two codebook entries.
        half2 res2 = {};
        #pragma unroll
        for (int j = 0; j < 4; j++)
          res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2);
        res += __half2float(res2.x) + __half2float(res2.y);
        b_sh_rd++;
      }
      a_gl_rd += 32;
    }
  }

  if (pred) {
    // Sum the per-lane partial results; lane 0 ends up with the row total.
    #pragma unroll
    for (int i = 16; i > 0; i /= 2)
      res += __shfl_down_sync(0xffffffff, res, i);
    if (threadIdx.x % 32 == 0)
      reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
  }
 }
 // Dequantization kernel for the "1x16" code layout: every 16-bit code in A is
 // replaced by the int4 (8 halves) codebook entry it selects, written to C.
 // One warp handles one row of A.
 __global__ void Code1x16Dequant(
  const int4* __restrict__ A,
        int4* __restrict__ C,
  const int4* __restrict__ codebook,
  int prob_m,
  int prob_k,
  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long, sums to m.
  const int codebook_stride // as int4
 ) {
  // Number of int4 of A per row (8 codes per int4, 8 halves per code).
  int a_gl_stride = prob_k / 8 / 8;
  int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
  bool pred = a_gl_rd < prob_m;

  if (pred)
  {
    // Advance to the correct codebook; this is easy because we only multiply one column of the codebook.
    auto codebook_size = &codebook_a_sizes.x;
    while (a_gl_rd >= *codebook_size)
    {
        codebook += codebook_stride;
        ++codebook_size;
    }
  }

  a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
  int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;

  // NOTE(review): c_gl_stride / c_gl_wr are computed but never read below;
  // the store index is derived from a_gl_rd instead.
  int c_gl_stride = prob_k / 8;
  int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
  c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;

  // Number of 32-int4 steps needed to cover one row of A (rounded up).
  int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
  while (iters--) {
    if (pred && a_gl_rd < a_gl_end) {
      // One int4 of A holds the 8 codes this lane expands in this step.
      const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
      #pragma unroll
      for (int i = 0; i < 8; i++) {
        int4 chunk;
        auto dec = reinterpret_cast<uint32_t*>(&chunk);
        // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
        // actually help us; this brings > 2x speedup.
        asm volatile (
          "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
          : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
          : "l"((void*) &codebook[enc[i]])
        );

        // Each int4 of codes expands to 8 consecutive int4 of output.
        C[a_gl_rd * 8 + i] = chunk;
      }
    }
    a_gl_rd += 32;
  }
 }
 // Dequantization kernel for the "2x8" code layout: every pair of 8-bit codes in
 // A selects one entry from each of two 256-entry codebooks; their half2 sum is
 // written to C as one int4 (8 halves). One warp handles one row of A.
 //
 // Requires dynamic shared memory for the replicated codebooks
 // (2 * 256 * 8 int4).
 __global__ void Code2x8Dequant(
  const int4* __restrict__ A,
        int4* __restrict__ C,
  const int4* __restrict__ codebook,
  int prob_m,
  int prob_k,
  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
  const int codebook_stride // as int4
 ) {
  // Number of int4 of A per row.
  int a_gl_stride = prob_k / 8 / 8;
  int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
  bool pred = a_gl_rd < prob_m;

  if (pred)
  {
    // Advance to the correct codebook; this is easy because we only multiply one column of the codebook.
    auto codebook_size = &codebook_a_sizes.x;
    while (a_gl_rd >= *codebook_size)
    {
        codebook += codebook_stride;
        ++codebook_size;
    }
  }

  a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
  int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
  // Selects which of the 8 replicated copies of a codebook entry this thread reads.
  int lane = threadIdx.x % 8;

  // NOTE(review): c_gl_stride / c_gl_wr are computed but never read below;
  // the store index is derived from a_gl_rd instead.
  int c_gl_stride = prob_k / 8;
  int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
  c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;

  // Dynamic shared memory layout: [codebook 0 | codebook 1].
  extern __shared__ int4 sh[];
  int4* sh_code = sh;
  int4* sh_code0 = sh_code;
  int4* sh_code1 = sh_code + 256 * 8;

  // Copy both 256-entry codebooks into shared memory, writing each entry into
  // all 8 slots of its group.
  for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
    int4 dec = codebook[i];
    #pragma unroll
    for (int j = 0; j < 8; j++)
      sh_code[8 * i + (j + lane) % 8] = dec;
  }
  __syncthreads();

  // NOTE(review): res is never used in this kernel.
  float res = 0;

  // Number of 32-int4 steps needed to cover one row of A (rounded up).
  int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
  while (iters--) {
    if (pred && a_gl_rd < a_gl_end) {
      // One int4 of A holds 16 bytes: 8 pairs of (codebook 0, codebook 1) codes.
      const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
      #pragma unroll
      for (int i = 0; i < 8; i++) {
        int4 chunk;
        half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
        half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
        // The decoded weight is the sum of the two codebook entries.
        #pragma unroll
        for (int j = 0; j < 4; j++)
          reinterpret_cast<half2*>(&chunk)[j] = __hadd2(a0[j], a1[j]);
        // Each int4 of codes expands to 8 consecutive int4 of output.
        C[a_gl_rd * 8 + i] = chunk;
      }
    }
    a_gl_rd += 32;
  }
 }
 // Integer division of a by b, rounded up (for the non-negative a and
 // positive b used by the launchers).
 inline int ceildiv(int a, int b) {
  const int numerator = a + b - 1;
  return numerator / b;
 }
 const int THREAD_M = 16;
 void  code1x16_matvec_cuda(
  const void* __restrict__ A,
  const void* __restrict__ B,
        void* __restrict__ C,
  const void* __restrict__ codebook,
  int prob_m,
  int prob_k,
  const int4 codebook_a_sizes,
  const int codebook_stride
 ) {
  int sms;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
  int waves = 0;
  int thread_m;
  do {
    waves++;
    thread_m = ceildiv(prob_m, waves * sms);
  } while (thread_m > THREAD_M);
  int blocks = ceildiv(prob_m, thread_m);
  int threads = 32 * thread_m;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  Code1x16MatVec<<<blocks, threads, 16*32*9, stream>>>(
    (const int4*) A,
    (const int4*) B,
    (int4*) C,
    (const int4*) codebook,
    prob_m,
    prob_k,
    codebook_a_sizes,
    codebook_stride
  );
 }
 void  code2x8_matvec_cuda(
  const void* __restrict__ A,
  const void* __restrict__ B,
        void* __restrict__ C,
  const void* __restrict__ codebook,
  int prob_m,
  int prob_k,
  const int4 codebook_a_sizes,
  const int codebook_stride
 ) {
  int sms;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
  int waves = 0;
  int thread_m;
  do {
    waves++;
    thread_m = ceildiv(prob_m, waves * sms);
  } while (thread_m > THREAD_M);
  int blocks = ceildiv(prob_m, thread_m);
  int threads = 32 * thread_m;
  int shared = 16 * (2 * 256 * 8 + 32 * 9);
  cudaFuncSetAttribute(
    Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
  );
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  Code2x8MatVec<<<blocks, threads, shared, stream>>>(
    (const int4*) A,
    (const int4*) B,
    (int4*) C,
    (const int4*) codebook,
    prob_m,
    prob_k,
    codebook_a_sizes,
    codebook_stride
  );
 }
 void code1x16_dequant_cuda(
  const void* __restrict__ A,
        void* __restrict__ C,
  const void* __restrict__ codebook,
  int prob_m,
  int prob_k,
  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long.
  const int codebook_stride // as int4.
 ) {
  int sms;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
  int waves = 0;
  int thread_m;
  do {
    waves++;
    thread_m = ceildiv(prob_m, waves * sms);
  } while (thread_m > THREAD_M);
  int blocks = ceildiv(prob_m, thread_m);
  int threads = 32 * thread_m;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  Code1x16Dequant<<<blocks, threads, 0, stream>>>(
    (const int4*) A,
    (int4*) C,
    (const int4*) codebook,
    prob_m,
    prob_k,
    codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long.
    codebook_stride // as int4.
  );
 }
 // Dequantizes the code and codebook into weights.
 void  code2x8_dequant_cuda(
  const void* __restrict__ A,
        void* __restrict__ C,
  const void* __restrict__ codebook,
  int prob_m,
  int prob_k,
  const int4 codebook_a_sizes,  // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
  const int codebook_stride // as int4
 ) {
  int sms;
  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
  int waves = 0;
  int thread_m;
  do {
    waves++;
    thread_m = ceildiv(prob_m, waves * sms);
  } while (thread_m > THREAD_M);
  int blocks = ceildiv(prob_m, thread_m);
  int threads = 32 * thread_m;
  int shared = 16 * (2 * 256 * 8 + 32 * 9);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  cudaFuncSetAttribute(
    Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
  );
  Code2x8Dequant<<<blocks, threads, shared, stream>>>(
    (const int4*) A,
    (int4*) C,
    (const int4*) codebook,
    prob_m,
    prob_k,
    codebook_a_sizes,
    codebook_stride
  );
 }
 // Returns the distance between consecutive codebooks (stride of dimension 0),
 // converted from elements to int4 units.
 int codebook_stride(const torch::Tensor& codebooks)
 {
  const auto stride_in_bytes = codebooks.stride(0) * codebooks.element_size();
  return stride_in_bytes / sizeof(int4);
 }
 // Tensor-level wrapper around code1x16_matvec_cuda: computes C from the codes
 // A, the input vector B and the codebook, on A's device.
 void code1x16_matvec(
  const torch::Tensor& A,
  const torch::Tensor& B,
        torch::Tensor& C,
  const torch::Tensor& codebook,
  const int4 codebook_a_sizes  // cumulative sizes of A spanning each codebook, at most 3 long.
 ) {
  // Make A's device current for the duration of the launch.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));

  // Problem sizes come from the output (rows) and input (length) vectors.
  const int out_rows = C.size(0);
  const int in_length = B.size(0);
  const int stride_int4 = codebook_stride(codebook);

  code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(),
                       codebook.data_ptr(), out_rows, in_length,
                       codebook_a_sizes, stride_int4);
 }
 // Multiplies `input` by the 1x16-quantized weight, one input row at a time.
 //
 // The input is flattened to 2-D, each row is run through code1x16_matvec, the
 // result is multiplied by the flattened `scales`, `bias` is added if present,
 // and the output is reshaped to the input's leading dimensions with the last
 // dimension inferred.
 torch::Tensor code1x16_matmat(
  const torch::Tensor& input,
  const torch::Tensor& codes,
  const torch::Tensor& codebooks,
  const torch::Tensor& scales,
  const int4 codebook_a_sizes,
  const std::optional<torch::Tensor>& bias) {
  auto input_sizes = input.sizes();
  auto out_features = codes.size(0) * codebooks.size(2);
  auto flat_input = input.reshape({-1, input.size(-1)});
  auto flat_output = torch::empty({flat_input.size(0), out_features},
    torch::TensorOptions()
      .dtype(input.dtype())
      .device(input.device())
  );

  // The squeezed view of the codes does not depend on the row, so build it once.
  const torch::Tensor squeezed_codes = codes.squeeze(2);
  // Tensor sizes are int64_t; keep the loop index the same width.
  const int64_t num_rows = flat_input.size(0);
  for (int64_t i = 0; i < num_rows; ++i) {
    auto input_vec = flat_input.index({i});
    auto output_vec = flat_output.index({i});
    code1x16_matvec(
      squeezed_codes,
      input_vec,
      output_vec,
      codebooks,
      codebook_a_sizes
    );
  }
  flat_output *= scales.flatten().unsqueeze(0);

  if (bias.has_value()) {
    flat_output += bias->unsqueeze(0);
  }

  // Restore the leading dimensions; the last one is inferred by reshape.
  auto output_sizes = input_sizes.vec();
  output_sizes.pop_back();
  output_sizes.push_back(-1);
  auto output = flat_output.reshape(output_sizes);
  return output;
 }
 // Tensor-level wrapper around code2x8_matvec_cuda: computes C from the codes
 // A, the input vector B and the codebook, on A's device.
 void code2x8_matvec(
  const torch::Tensor& A,
  const torch::Tensor& B,
        torch::Tensor& C,
  const torch::Tensor& codebook,
  const int4 codebook_a_sizes
 ) {
  // Make A's device current for the duration of the launch.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));

  // Problem sizes come from the output (rows) and input (length) vectors.
  const int out_rows = C.size(0);
  const int in_length = B.size(0);
  // Two codebooks are used per partition, so the stride is doubled.
  const int stride_int4 = 2 * codebook_stride(codebook);

  code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(),
                      codebook.data_ptr(), out_rows, in_length,
                      codebook_a_sizes, stride_int4);
 }
 // Multiplies `input` by the 2x8-quantized weight, one input row at a time.
 //
 // The input is flattened to 2-D, each row is run through code2x8_matvec, the
 // result is multiplied by the flattened `scales`, `bias` is added if present,
 // and the output is reshaped to the input's leading dimensions with the last
 // dimension inferred.
 torch::Tensor code2x8_matmat(
  const torch::Tensor& input,
  const torch::Tensor& codes,
  const torch::Tensor& codebooks,
  const torch::Tensor& scales,
  const int4 codebook_a_sizes,
  const std::optional<torch::Tensor>& bias
 ) {
  auto input_sizes = input.sizes();
  auto out_features = codes.size(0) * codebooks.size(2);
  auto flat_input = input.reshape({-1, input.size(-1)});
  auto flat_output = torch::empty({flat_input.size(0), out_features},
    torch::TensorOptions()
      .dtype(input.dtype())
      .device(input.device())
  );

  // The squeezed view of the codes does not depend on the row, so build it once.
  const torch::Tensor squeezed_codes = codes.squeeze(2);
  // Tensor sizes are int64_t; keep the loop index the same width.
  const int64_t num_rows = flat_input.size(0);
  for (int64_t i = 0; i < num_rows; ++i) {
    auto input_vec = flat_input.index({i});
    auto output_vec = flat_output.index({i});
    code2x8_matvec(
      squeezed_codes,
      input_vec,
      output_vec,
      codebooks,
      codebook_a_sizes
    );
  }
  flat_output *= scales.flatten().unsqueeze(0);
  if (bias.has_value()) {
    flat_output += bias->unsqueeze(0);
  }

  // Restore the leading dimensions; the last one is inferred by reshape.
  auto output_sizes = input_sizes.vec();
  output_sizes.pop_back();
  output_sizes.push_back(-1);
  auto output = flat_output.reshape(output_sizes);
  return output;
 }
 // Accumulate the partition sizes.
 //
 // Returns the running sums of `codebook_partition_sizes` in the x, y, z, w
 // members of an int4. Unused trailing slots are filled with ten times the
 // total, a value no row index handled by the kernels reaches.
 //
 // Raises (via TORCH_CHECK) if more than 4 partition sizes are given, since
 // the result cannot hold them.
 int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes)
 {
  const int64_t num_partitions = codebook_partition_sizes.size(0);
  // An assert would vanish under NDEBUG and let the loop write out of bounds.
  TORCH_CHECK(num_partitions <= 4,
              "AQLM supports at most 4 codebook partitions, got ", num_partitions);

  // Build the sums in a plain array instead of stepping a pointer across the
  // members of the int4.
  int sums[4];
  int last = 0;
  int i = 0;
  for (; i < num_partitions; ++i)
  {
    sums[i] = codebook_partition_sizes[i].item<int>() + last;
    last = sums[i];
  }
  // fill in the rest with unreachable.
  for (; i < 4; ++i)
  {
    sums[i] = last*10;
  }

  int4 cumulative_sizes;
  cumulative_sizes.x = sums[0];
  cumulative_sizes.y = sums[1];
  cumulative_sizes.z = sums[2];
  cumulative_sizes.w = sums[3];
  return cumulative_sizes;
 }
 } // namespace aqlm
 } // namespace vllm
 // Entry point for the AQLM quantized GEMM.
 //
 // Dispatches on the codebook configuration: one codebook with 2^16 entries
 // (1x16) or two codebooks with 2^8 entries (2x8). Any other configuration, or
 // an empty `codebook_partition_sizes`, raises via TORCH_CHECK.
 torch::Tensor aqlm_gemm(
  const torch::Tensor& input,
  const torch::Tensor& codes,
  const torch::Tensor& codebooks,
  const torch::Tensor& scales,
  const torch::Tensor& codebook_partition_sizes,
  const std::optional<torch::Tensor>& bias
 )
 {
  // The partition count is used as a divisor below.
  TORCH_CHECK(codebook_partition_sizes.size(0) > 0,
              "AQLM requires at least one codebook partition size.");

  int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);

  // Codebooks per partition and entries per codebook select the kernel.
  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
  int const entries = codebooks.size(1);

  if (nbooks == 1 && entries == (1 << 16))
  {
    return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
  }
  if (nbooks == 2 && entries == (1 << 8))
  {
    return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
  }

  TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.");
  return {};
 }
 // Entry point for AQLM dequantization.
 //
 // Allocates a {codes.size(0), codes.size(1) * 8} tensor with the dtype and
 // device of `codebooks` and fills it with the decoded weights. Dispatches on
 // the codebook configuration: one codebook with 2^16 entries (1x16) or two
 // codebooks with 2^8 entries (2x8). Any other configuration, or an empty
 // `codebook_partition_sizes`, raises via TORCH_CHECK.
 torch::Tensor aqlm_dequant(
  const torch::Tensor& codes,
  const torch::Tensor& codebooks,
  const torch::Tensor& codebook_partition_sizes
 )
 {
  // The partition count is used as a divisor below.
  TORCH_CHECK(codebook_partition_sizes.size(0) > 0,
              "AQLM requires at least one codebook partition size.");

  int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);

  // Codebooks per partition and entries per codebook select the kernel.
  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
  int const entries = codebooks.size(1);

  const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));

  auto in_features = codes.size(1) * 8;
  auto out_features = codes.size(0);

  // Compare, do not assign: the partition sizes must add up to the row count.
  assert(out_features == codebook_partition_sizes.sum().item<int>());

  auto weights = torch::empty({out_features, in_features},
    torch::TensorOptions()
      .dtype(codebooks.dtype())
      .device(codebooks.device())
  );

  if (nbooks == 1 && entries == (1 << 16))
  {
    vllm::aqlm::code1x16_dequant_cuda(
      codes.data_ptr(),
      weights.data_ptr(),
      codebooks.data_ptr(),
      out_features,
      in_features,
      cumulative_sizes,
      vllm::aqlm::codebook_stride(codebooks));

    // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.)
    // weights *= scales.index({"...", 0, 0});

    return weights;
  }

  if (nbooks == 2 && entries == (1 << 8))
  {
    vllm::aqlm::code2x8_dequant_cuda(
      codes.data_ptr(),
      weights.data_ptr(),
      codebooks.data_ptr(),
      out_features,
      in_features,
      cumulative_sizes,
      vllm::aqlm::codebook_stride(codebooks));

    // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation)
    // weights *= scales.index({"...", 0, 0});

    return weights;
  }

  TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.");
  return {};
 }
--- a/csrc/quantization/fp8/amd_detail/hip_float8.h
+++ b/csrc/quantization/fp8/amd_detail/hip_float8.h
@@ -0,0 +1,167 @@
 #pragma once
 #ifdef __HIPCC__
 #include <hip/hip_runtime.h>
 #else
 #include <type_traits>
 #include <stdint.h>
 #include <math.h>
 #include <iostream>
 #endif
 #include "hip_float8_impl.h"
 // One-byte floating-point value stored as raw bits in `data`.
 //
 // Conversions to and from float go through hip_fp8_impl. Inside
 // __HIP__MI300__ builds the device-side overloads use hardware conversion
 // and the host-side overloads use the software routines; in all other builds
 // a single host/device overload uses the software routines.
 struct alignas(1) hip_fp8
 {
    // Tag type used to select the constructor that takes raw bits.
    struct from_bits_t
    {
    };
    HIP_FP8_HOST_DEVICE static constexpr from_bits_t from_bits() { return from_bits_t(); }
    // Raw encoded byte.
    uint8_t data;

    hip_fp8() = default;
    HIP_FP8_HOST_DEVICE constexpr hip_fp8(const hip_fp8&) = default;
    // Construction from a bare uint8_t is deleted; raw bits must be passed
    // together with the from_bits tag (next constructor).
    HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v) = delete;
    explicit HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v, from_bits_t)
        : data(v)
    {
    }

 #ifdef __HIP__MI300__
    // NOTE: ON-DEVICE... always optimal bias
    explicit HIP_FP8_DEVICE hip_fp8(float v)
        : data(hip_fp8_impl::to_fp8_from_fp32(v))
    {
    }

    explicit HIP_FP8_DEVICE hip_fp8(_Float16 v)
        : hip_fp8(static_cast<float>(v))
    {
    }

    // Host only implementation using s/w simulation
    explicit HIP_FP8_HOST
 #else  // __HIP__MI300__
    // both Host and DEVICE for non-MI300 using s/w simulation
    explicit HIP_FP8_HOST_DEVICE
 #endif // __HIP__MI300__
    // The qualifiers chosen by the preprocessor branch above apply to this
    // constructor, which converts in software with we=4, wm=3.
    hip_fp8(float v)
    {
        data = hip_fp8_impl::to_float8<4, 3, float, true /*negative_zero_nan*/, true /*clip*/>(v);
    }

    // Doubles are narrowed to float first.
    explicit HIP_FP8_HOST_DEVICE hip_fp8(double v)
        : hip_fp8(static_cast<float>(v))
    {
    }

 #ifdef __HIP__MI300__
    // upcast using device specific intrinsic
    explicit inline HIP_FP8_DEVICE operator float() const
    {
        float fval;
        uint32_t i32val = static_cast<uint32_t>(data);

        // upcast
        asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val));

        return fval;
    }

    explicit inline HIP_FP8_HOST operator float() const
 #else  // __HIP__MI300__
    explicit inline HIP_FP8_HOST_DEVICE operator float() const
 #endif // __HIP__MI300__
    // The signature chosen by the preprocessor branch above owns this body:
    // software conversion back to float with we=4, wm=3.
    {
        return hip_fp8_impl::from_float8<4, 3, float, true /*negative_zero_nan*/>(data);
    }
 };
 // Math helpers for hip_fp8 placed in namespace std: the trigonometric
 // functions are evaluated in float and converted back to hip_fp8.
 namespace std
 {
 // Sine of a, computed in float.
 inline hip_fp8 sin(hip_fp8 a)
 {
    const float angle = float(a);
    return hip_fp8(sinf(angle));
 }
 // Cosine of a, computed in float.
 inline hip_fp8 cos(hip_fp8 a)
 {
    const float angle = float(a);
    return hip_fp8(cosf(angle));
 }
 // Returns a itself.
 HIP_FP8_HOST_DEVICE constexpr hip_fp8 real(const hip_fp8& a)
 {
    return a;
 }
 } // namespace std
 // Special operator overloading
 // Streams the value converted to float.
 inline std::ostream& operator<<(std::ostream& os, const hip_fp8& f8)
 {
    os << float(f8);
    return os;
 }
 // all + operator overloading with mixed types
 // mixed types, always converts to f32, does computation in f32, and returns float
 // float + hip_fp8: computed and returned as float.
 inline HIP_FP8_HOST_DEVICE float operator+(const float fa, hip_fp8 b)
 {
    const float rhs = float(b);
    return fa + rhs;
 }

 // hip_fp8 + float: computed and returned as float.
 inline HIP_FP8_HOST_DEVICE float operator+(hip_fp8 a, const float fb)
 {
    const float lhs = float(a);
    return lhs + fb;
 }

 // hip_fp8 + hip_fp8: summed in float, then converted back to hip_fp8.
 inline HIP_FP8_HOST_DEVICE hip_fp8 operator+(hip_fp8 a, hip_fp8 b)
 {
    const float sum = float(a) + float(b);
    return hip_fp8(sum);
 }

 // In-place add: summed in float, stored back into a as hip_fp8.
 inline HIP_FP8_HOST_DEVICE hip_fp8& operator+=(hip_fp8& a, hip_fp8 b)
 {
    a = hip_fp8(float(a) + float(b));
    return a;
 }
 // overloading multiplication, always returns float,
 // hip_fp8 * hip_fp8: both operands are widened to float; returns float.
 inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, hip_fp8 b)
 {
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs * rhs;
 }

 // float * hip_fp8: returns float.
 inline HIP_FP8_HOST_DEVICE float operator*(float a, hip_fp8 b)
 {
    const float rhs = float(b);
    return a * rhs;
 }

 // hip_fp8 * float: returns float.
 inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, float b)
 {
    const float lhs = float(a);
    return lhs * b;
 }

 // int32_t * hip_fp8: the integer is converted to float first; returns float.
 inline HIP_FP8_HOST_DEVICE float operator*(int32_t a, hip_fp8 b)
 {
    const float lhs = (float)a;
    return lhs * float(b);
 }

 // double * hip_fp8: the double is narrowed to float first; returns float.
 inline HIP_FP8_HOST_DEVICE float operator*(double a, hip_fp8 b)
 {
    const float lhs = (float)a;
    return lhs * float(b);
 }
 // overloading for compare
 // Equality compares the raw encoded bytes, not the decoded float values.
 inline HIP_FP8_HOST_DEVICE bool operator==(hip_fp8 a, hip_fp8 b)
 {
    const bool same_bits = a.data == b.data;
    return same_bits;
 }

 // Inequality compares the raw encoded bytes, not the decoded float values.
 inline HIP_FP8_HOST_DEVICE bool operator!=(hip_fp8 a, hip_fp8 b)
 {
    const bool different_bits = a.data != b.data;
    return different_bits;
 }

 // Ordering comparisons are done on the values converted to float.
 inline HIP_FP8_HOST_DEVICE bool operator>=(hip_fp8 a, hip_fp8 b)
 {
    const float lhs = static_cast<float>(a);
    const float rhs = static_cast<float>(b);
    return lhs >= rhs;
 }

 inline HIP_FP8_HOST_DEVICE bool operator>(hip_fp8 a, hip_fp8 b)
 {
    const float lhs = static_cast<float>(a);
    const float rhs = static_cast<float>(b);
    return lhs > rhs;
 }
--- a/csrc/quantization/fp8/amd_detail/hip_float8_impl.h
+++ b/csrc/quantization/fp8/amd_detail/hip_float8_impl.h
@@ -0,0 +1,316 @@
 #pragma once
 #if defined(__HIPCC__) && (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
 #define __HIP__MI300__
 #endif
 #ifdef __HIPCC__
 #define HIP_FP8_HOST_DEVICE __host__ __device__
 #define HIP_FP8_HOST __host__
 #define HIP_FP8_DEVICE __device__
 #else
 #define HIP_FP8_HOST_DEVICE
 #define HIP_FP8_HOST
 #define HIP_FP8_DEVICE
 #endif
 namespace hip_fp8_impl
 {
 #ifdef __HIP__MI300__
 // Hardware-assisted float -> fp8 conversion (only compiled when
 // __HIP__MI300__ is defined). Returns the raw 8-bit encoding.
 // Finite inputs are first passed through fmed3f with the bounds 240 / -240;
 // inputs whose exponent field is all ones (NaN/Inf) skip that step.
 HIP_FP8_DEVICE uint8_t to_fp8_from_fp32(float v)
 {
    uint8_t i8data;
    // Union used to view the same 32 bits as float, as a word, and as bytes.
    union {
        float fval;
        uint32_t i32val;
        uint8_t i8val[4]; // NOTE: not endian independent
    } val;
    uint32_t ival = 0;
    val.fval = v;
    if ((val.i32val & 0x7F800000) != 0x7F800000) { /// propagate NAN/INF, no clipping
        val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
    }
    // The same value is supplied for both packed inputs; only byte 0 of the
    // returned word is consumed below.
    ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival,
        false); // false -> WORD0
    val.i32val = ival;
    i8data = val.i8val[0];
    return i8data;
 }
 #endif // __HIP__MI300__
 // Count-leading-zeros helper, host overload: forwards to the compiler
 // builtin.
 // NOTE(review): __builtin_clz is documented as undefined for x == 0; the
 // caller visible in this file states its argument is non-zero - confirm for
 // any new caller.
 HIP_FP8_HOST inline int clz(uint32_t x)
 {
    return __builtin_clz(x);
 }
 #if defined(__HIPCC__) || defined(__CUDA_ARCH__)
 // Device overload: forwards to the __clz device intrinsic. Only declared
 // when compiling with HIPCC or for a CUDA device architecture.
 HIP_FP8_DEVICE inline int clz(uint32_t x)
 {
    return __clz(x);
 }
 #endif
 // Software conversion of a float (or _Float16 when built with HIPCC) to an
 // 8-bit float encoding with `we` exponent bits and `wm` mantissa bits
 // (we + wm must equal 7).
 //   negative_zero_nan: when true, Inf/NaN inputs encode as 0x80 and the f8
 //                      exponent bias is one larger than 2^(we-1) - 1.
 //   clip:              when true, results above the representable range
 //                      saturate to the largest finite code; when false they
 //                      return the signed all-ones-exponent pattern.
 //   stoch, rng:        when stoch is true, `rng` is added to the bits about
 //                      to be dropped instead of the round-to-nearest-even
 //                      adjustment.
 // Returns the raw 8-bit encoding.
 template <int we, int wm, typename T, bool negative_zero_nan, bool clip>
 HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false, uint32_t rng = 0)
 {
 #ifdef __HIPCC__
    constexpr bool is_half = std::is_same<T, _Float16>::value;
 #else
    constexpr bool is_half = false;
 #endif
    constexpr bool is_float = std::is_same<T, float>::value;
    static_assert(wm + we == 7, "wm+we==7");
    static_assert(is_half || is_float, "Only half and float can be cast to f8");
    // Number of explicit mantissa bits in the source format.
    const int mfmt = (sizeof(T) == 4) ? 23 : 10;
    // Raw bit pattern of the input, zero-extended to 32 bits.
    uint32_t x;
    if (sizeof(T) == 4) {
        x = reinterpret_cast<uint32_t&>(_x);
    } else {
        x = reinterpret_cast<uint16_t&>(_x);
    }
    uint32_t head, mantissa;
    int exponent, bias;
    uint32_t sign;
    // Split the source bits into sign, biased exponent and mantissa.
    if (sizeof(T) == 4) {
        head = x & 0xFF800000;
        mantissa = x & 0x7FFFFF;
        exponent = (head >> 23) & 0xFF;
        sign = head >> 31;
        bias = 127;
    } else {
        head = x & 0xFC00;
        mantissa = x & 0x3FF;
        exponent = (head >> 10) & 0x1F;
        sign = head >> 15;
        bias = 15;
    }
    // Sign bit plus an all-ones exponent field and zero mantissa.
    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
    // Deal with inf and NaNs
    if (negative_zero_nan) {
        if (sizeof(T) == 4) {
            if ((x & 0x7F800000) == 0x7F800000) {
                return 0x80;
            }
        } else {
            // if(__hisinf(x) || __hisnan(x))
            if ((x & 0x7C00) == 0x7C00) {
                return 0x80;
            }
        }
    } else {
        if (sizeof(T) == 4) {
            if ((x & 0x7F800000) == 0x7F800000) {
                return signed_inf + (mantissa != 0 ? 1 : 0);
            }
        } else {
            if ((x & 0x7C00) == 0x7C00) {
                return signed_inf + (mantissa != 0 ? 1 : 0);
            }
        }
    }
    // Only the all-zero bit pattern (+0) takes this early exit.
    if (x == 0) {
        return 0;
    }
    // First need to check if it is normal or denorm as there is a difference of
    // implicit 1 Then need to adjust the exponent to align with the F8 exponent,
    // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng
    // to mantissa and truncate. And for RNE, no need to add rng. Then probably
    // need to check whether there is carry and adjust exponent and mantissa again
    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent
    // bits
    const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
    // f8_exponent is the converted f8 exponent with bias encoding
    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
    // the difference needs to be adjusted and mantissa shifted
    int act_exponent, f8_exponent, exponent_diff;
    if (exponent == 0) { // fp32/fp16 is in denormal.
        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we
 mostly concern fp16 here. In this case, f8 is usually in denormal. But there
 could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has
 exponent bias 16. It means that there are some numbers in fp16 denormal but they
 are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
 where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
 (NANOO) normal. In this case, the fp16 mantissa should be shift left by 1  */
        act_exponent = exponent - bias + 1;
        exponent_diff = f8_denormal_act_exponent - act_exponent; // actual exponent is exponent-bias+1 as it is denormal
    } else {                                                     // fp32/fp16 is normal with implicit 1
        act_exponent = exponent - bias;
        if (act_exponent <= f8_denormal_act_exponent) {
            /* This is the case where fp32/fp16 is normal but it is in f8 denormal
 range. For example fp8 nanoo mode, denormal exponent is -7, but if the
 fp32/fp16 actual exponent is -7, it is actually larger due to the implicit 1,
 Therefore it needs to be adjust to -6 and mantissa shift right by 1.
 So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
            exponent_diff = f8_denormal_act_exponent - act_exponent;
        } else {               // both fp32/fp16 and f8 are in normal range
            exponent_diff = 0; // exponent_diff=0 does not mean there is no difference
                               // for this case,
                               // act_exponent could be larger. Just that it does not need shift mantissa
        }
        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
    }
    // NOTE(review): exponent_diff grows with how far the input lies below the
    // f8 denormal range (e.g. an fp32 input with act_exponent near -126), so
    // the shift counts used below can reach or exceed 32 - confirm this is
    // acceptable on all targets.
    bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) ==
                    static_cast<uint32_t>(1 << (mfmt - wm + exponent_diff - 1));
    /* This part is a bit tricky. The judgment of whether it is a tie needs to be
   done before we shift right as shift right could rip off some residual part
   and make something not midpoint look like midpoint. For example, the fp16
   number 0x1002 (0 00100 0000000010), it is larger than midpoint, but after
   shift right by 4 bits, it would look like midpoint.
 */
    if (exponent_diff > 0) {
        mantissa >>= exponent_diff;
    } else if (exponent_diff == -1) {
        mantissa <<= -exponent_diff;
    }
    bool implicit_one = mantissa & (1 << mfmt);
    // if there is no implicit 1, it  means the f8 is denormal and need to adjust
    // to denorm exponent
    f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
    // Now we have the exponent and mantissa adjusted
    // Mask selecting the source mantissa bits that do not fit in wm bits.
    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
    bool odd = mantissa & (1 << (mfmt - wm)); // if the least significant bit that
                                              // is not truncated is 1
    // Rounding step: add either rng (stochastic) or a value derived from the
    // mantissa itself, restricted to the bits that will be dropped, so that a
    // carry into the kept bits performs the round-up.
    mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;
    // Now we deal with overflow
    if (f8_exponent == 0) {
        if ((1 << mfmt) & mantissa) {
            f8_exponent = 1; // denormal overflow to become normal, promote exponent
        }
    } else {
        if ((1 << (mfmt + 1)) & mantissa) {
            mantissa >>= 1;
            f8_exponent++;
        }
    }
    // Drop the low bits; what remains is the wm-bit mantissa (plus implicit 1).
    mantissa >>= (mfmt - wm);
    // above range: quantize to maximum possible float of the same sign
    const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
    if (f8_exponent > max_exp) {
        if (clip) {
            mantissa = (1 << wm) - 1;
            f8_exponent = max_exp;
        } else {
            return signed_inf;
        }
    }
    // Result rounded to zero: in negative_zero_nan mode the sign is dropped
    // because 0x80 is reserved for NaN there.
    if (f8_exponent == 0 && mantissa == 0) {
        return negative_zero_nan ? 0 : (sign << 7);
    }
    mantissa &= (1 << wm) - 1;
    return (sign << 7) | (f8_exponent << wm) | mantissa;
 }
 // Decodes a raw 8-bit float (`we` exponent bits, `wm` mantissa bits) into T,
 // where T is float or, when built with HIPCC, _Float16.
 //   negative_zero_nan: when true, the code 0x80 decodes to NaN; when false,
 //                      0x80 decodes to negative zero and an all-ones exponent
 //                      decodes to +/-Inf (zero mantissa) or NaN.
 // The code 0x00 always decodes to zero.
 template <int we, int wm, typename T = float, bool negative_zero_nan = true>
 inline HIP_FP8_HOST_DEVICE T from_float8(uint8_t x)
 {
 #ifdef __HIPCC__
    constexpr bool is_half = std::is_same<T, _Float16>::value;
 #else
    constexpr bool is_half = false;
 #endif
    constexpr bool is_float = std::is_same<T, float>::value;
    static_assert(is_half || is_float, "only half and float are supported");
    // Exponent / mantissa widths of the output format.
    constexpr int weo = is_half ? 5 : 8;
    constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);
    // Special values of T, built from their bit patterns below.
    T fInf, fNegInf, fNaN, fNeg0;
 #ifdef __HIPCC__
    if (is_half) {
        const uint16_t ihInf = 0x7C00;
        const uint16_t ihNegInf = 0xFC00;
        const uint16_t ihNaN = 0x7C01;
        const uint16_t ihNeg0 = 0x8000;
        fInf = reinterpret_cast<const _Float16&>(ihInf);
        fNegInf = reinterpret_cast<const _Float16&>(ihNegInf);
        fNaN = reinterpret_cast<const _Float16&>(ihNaN);
        fNeg0 = reinterpret_cast<const _Float16&>(ihNeg0);
    } else
 #endif
        if (is_float) {
        const uint32_t ifInf = 0x7F800000;
        const uint32_t ifNegInf = 0xFF800000;
        const uint32_t ifNaN = 0x7F800001;
        const uint32_t ifNeg0 = 0x80000000;
        fInf = reinterpret_cast<const float&>(ifInf);
        fNegInf = reinterpret_cast<const float&>(ifNegInf);
        fNaN = reinterpret_cast<const float&>(ifNaN);
        fNeg0 = reinterpret_cast<const float&>(ifNeg0);
    }
    if (x == 0) {
        return 0;
    }
    // Split the fp8 code into sign, biased exponent and mantissa fields.
    uint32_t sign = x >> 7;
    uint32_t mantissa = x & ((1 << wm) - 1);
    int exponent = (x & 0x7F) >> wm;
    if (negative_zero_nan) {
        if (x == 0x80) {
            return fNaN;
        }
    } else {
        if (x == 0x80) {
            return fNeg0;
        }
        if (exponent == ((1 << we) - 1)) {
            return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
        }
    }
    // Integer type with the same width as T, used to assemble the result bits.
    typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
    // Shortcut for we == 5 into half without negative_zero_nan: the fp8 byte
    // is placed in the upper 8 bits of the 16-bit result.
    if (we == 5 && is_half && !negative_zero_nan) {
        retval = x << 8;
        return reinterpret_cast<const T&>(retval);
    }
    // Offset between the output exponent bias and the f8 exponent bias, plus 1.
    const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
    // subnormal input
    if (exponent == 0) {
        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
        // Normalize: shift the leading 1 up into the implicit-one position,
        // lower the exponent accordingly, then mask the implicit one off.
        int sh = 1 + clz(mantissa) - (32 - wm);
        mantissa <<= sh;
        exponent += 1 - sh;
        mantissa &= ((1 << wm) - 1);
    }
    exponent += exp_low_cutoff - 1;
    // Left-align the wm-bit mantissa within the wider output mantissa field.
    mantissa <<= wmo - wm;
    // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
    if (exponent <= 0) {
        mantissa |= 1 << wmo;
        mantissa >>= 1 - exponent;
        exponent = 0;
    }
    if (sizeof(T) == 2) {
        retval = (sign << 15) | (exponent << 10) | mantissa;
    } else {
        retval = (sign << 31) | (exponent << 23) | mantissa;
    }
    return reinterpret_cast<const T&>(retval);
 }
 } // namespace hip_fp8_impl
--- a/csrc/quantization/fp8/amd_detail/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd_detail/quant_utils.cuh
@@ -0,0 +1,517 @@
 #pragma once
 #include "hip_float8.h"
 #include <hip/hip_fp16.h>
 #include <hip/hip_bf16.h>
 #include <hip/hip_bfloat16.h>
 #include "../../../attention/dtype_float32.cuh"
 #include "../../../attention/dtype_bfloat16.cuh"
 namespace vllm
 {
 namespace fp8_e4m3 {
 // Primary template for unscaled conversions: returns `x` converted to Tout by
 // the language's implicit conversion. The explicit specializations that
 // follow in this file provide the fp8-aware conversions.
 template <typename Tout, typename Tin>
 __inline__ __device__ Tout vec_conversion(const Tin& x)
 {
    return x;
 }
 // Primary template for scaled conversions: returns `x` converted to Tout.
 // Note that `scale` is not applied in this fallback.
 template <typename Tout, typename Tin>
 __inline__ __device__ Tout scaled_vec_conversion(const Tin& x, const float scale)
 {
    return x;
 }
 // fp8 -> half
 // Decodes one fp8 byte and returns the bit pattern of the resulting half.
 template <>
 __inline__ __device__ uint16_t vec_conversion<uint16_t, uint8_t>(const uint8_t& a)
 {
    hip_fp8 decoded{a, hip_fp8::from_bits()};
    __half_raw half_bits;
    half_bits.data = static_cast<float>(decoded);
    return half_bits.x;
 }
 // fp8x2 -> half2
 template <>
 __inline__ __device__ uint32_t vec_conversion<uint32_t, uint16_t>(const uint16_t& a)
 {
 #if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
    // Bulk path: a single packed intrinsic yields both floats.
    const auto& pair_f32 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
    union {
        __half2_raw h2r;
        uint32_t ui32;
    } packed;
    packed.h2r.x.data = pair_f32[0];
    packed.h2r.y.data = pair_f32[1];
    return packed.ui32;
 #else
    // Fallback: convert the low byte and the high byte one at a time.
    const uint8_t lo_byte = static_cast<uint8_t>(a);
    const uint8_t hi_byte = static_cast<uint8_t>(a >> 8U);
    union {
        uint16_t u16[2];
        uint32_t u32;
    } packed;
    packed.u16[0] = vec_conversion<uint16_t, uint8_t>(lo_byte);
    packed.u16[1] = vec_conversion<uint16_t, uint8_t>(hi_byte);
    return packed.u32;
 #endif
 }
 // fp8x4 -> half2x2
 // Low 16 bits of `a` fill .x, high 16 bits fill .y.
 template <>
 __inline__ __device__ uint2 vec_conversion<uint2, uint32_t>(const uint32_t& a)
 {
    union {
        uint2 u32x2;
        uint32_t u32[2];
    } halves;
    halves.u32[0] = vec_conversion<uint32_t, uint16_t>(static_cast<uint16_t>(a));
    halves.u32[1] = vec_conversion<uint32_t, uint16_t>(static_cast<uint16_t>(a >> 16U));
    return halves.u32x2;
 }
 // fp8x8 -> half2x4
 // a.x produces the first uint2 of the result, a.y the second.
 template <>
 __inline__ __device__ uint4 vec_conversion<uint4, uint2>(const uint2& a)
 {
    union {
        uint4 u64x2;
        uint2 u64[2];
    } quads;
    quads.u64[0] = vec_conversion<uint2, uint32_t>(a.x);
    quads.u64[1] = vec_conversion<uint2, uint32_t>(a.y);
    return quads.u64x2;
 }
 using __nv_bfloat16 = __hip_bfloat16;
 // fp8 -> __nv_bfloat16
 // Decodes the fp8 byte to float, then converts that float to bfloat16.
 template <>
 __inline__ __device__ __nv_bfloat16 vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a)
 {
    hip_fp8 decoded{a, hip_fp8::from_bits()};
    const float widened = static_cast<float>(decoded);
    return __float2bfloat16(widened);
 }
 using __nv_bfloat162 = __hip_bfloat162;
 // fp8x2 -> __nv_bfloat162
 // Low byte of `a` fills .x, high byte fills .y.
 template <>
 __inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a)
 {
    __nv_bfloat162 pair;
    pair.x = vec_conversion<__nv_bfloat16, uint8_t>(static_cast<uint8_t>(a));
    pair.y = vec_conversion<__nv_bfloat16, uint8_t>(static_cast<uint8_t>(a >> 8U));
    return pair;
 }
 // fp8x4 -> bf16_4_t
 // Low 16 bits of `a` fill .x, high 16 bits fill .y.
 template <>
 __inline__ __device__ bf16_4_t vec_conversion<bf16_4_t, uint32_t>(const uint32_t& a)
 {
    bf16_4_t quad;
    quad.x = vec_conversion<__nv_bfloat162, uint16_t>(static_cast<uint16_t>(a));
    quad.y = vec_conversion<__nv_bfloat162, uint16_t>(static_cast<uint16_t>(a >> 16U));
    return quad;
 }
 // fp8x8 -> bf16_8_t
 // a.x supplies .x/.y of the result, a.y supplies .z/.w.
 template <>
 __inline__ __device__ bf16_8_t vec_conversion<bf16_8_t, uint2>(const uint2& a)
 {
    bf16_4_t first_half, second_half;
    first_half = vec_conversion<bf16_4_t, uint32_t>(a.x);
    second_half = vec_conversion<bf16_4_t, uint32_t>(a.y);
    bf16_8_t octet;
    octet.x = first_half.x;
    octet.y = first_half.y;
    octet.z = second_half.x;
    octet.w = second_half.y;
    return octet;
 }
 // fp8 -> float
 // Reinterprets the byte as fp8 bits and widens the value to float.
 template <>
 __inline__ __device__ float vec_conversion<float, uint8_t>(const uint8_t& a)
 {
    hip_fp8 decoded{a, hip_fp8::from_bits()};
    const float widened = static_cast<float>(decoded);
    return widened;
 }
 // fp8x2 -> float2
 template <>
 __inline__ __device__ float2 vec_conversion<float2, uint16_t>(const uint16_t& a)
 {
    float2 pair;
 #if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
    // Bulk path: one packed intrinsic yields both floats.
    const auto& pair_f32 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
    pair.x = pair_f32[0];
    pair.y = pair_f32[1];
 #else
    // Fallback: low byte -> .x, high byte -> .y.
    pair.x = vec_conversion<float, uint8_t>(static_cast<uint8_t>(a));
    pair.y = vec_conversion<float, uint8_t>(static_cast<uint8_t>(a >> 8U));
 #endif
    return pair;
 }
 // fp8x4 -> float4
 // Low 16 bits of `a` fill .x, high 16 bits fill .y.
 template <>
 __inline__ __device__ Float4_ vec_conversion<Float4_, uint32_t>(const uint32_t& a)
 {
    Float4_ quad;
    quad.x = vec_conversion<float2, uint16_t>(static_cast<uint16_t>(a));
    quad.y = vec_conversion<float2, uint16_t>(static_cast<uint16_t>(a >> 16U));
    return quad;
 }
 // fp8x8 -> float8
 // a.x supplies .x/.y of the result, a.y supplies .z/.w.
 template <>
 __inline__ __device__ Float8_ vec_conversion<Float8_, uint2>(const uint2& a)
 {
    Float4_ first_half, second_half;
    first_half = vec_conversion<Float4_, uint32_t>(a.x);
    second_half = vec_conversion<Float4_, uint32_t>(a.y);
    Float8_ octet;
    octet.x = first_half.x;
    octet.y = first_half.y;
    octet.z = second_half.x;
    octet.w = second_half.y;
    return octet;
 }
 // half -> fp8
 // `a` carries the bit pattern of a half; it is widened to float and encoded.
 template <>
 __inline__ __device__ uint8_t vec_conversion<uint8_t, uint16_t>(const uint16_t& a)
 {
    __half_raw half_bits;
    half_bits.x = a;
    hip_fp8 encoded{static_cast<float>(half_bits.data)};
    return encoded.data;
 }
 // bf16 -> fp8
 // Widened to float first, then encoded.
 template <>
 __inline__ __device__ uint8_t vec_conversion<uint8_t, __nv_bfloat16>(const __nv_bfloat16& a)
 {
    hip_fp8 encoded{__bfloat162float(a)};
    return encoded.data;
 }
 // float -> fp8
 // Returns the raw byte of the encoded value.
 template <>
 __inline__ __device__ uint8_t vec_conversion<uint8_t, float>(const float& a)
 {
    hip_fp8 encoded(a);
    return encoded.data;
 }
 // fp8x4 -> float4
 // Reuses the Float4_ conversion and flattens its two float2 halves.
 template <>
 __inline__ __device__ float4 vec_conversion<float4, uint32_t>(const uint32_t& a)
 {
    Float4_ pairs = vec_conversion<Float4_, uint32_t>(a);
    return make_float4(pairs.x.x, pairs.x.y, pairs.y.x, pairs.y.y);
 }
 // float2 -> half2
 // Converts with __float22half2_rn and returns the packed 32-bit pattern.
 template <>
 __inline__ __device__ uint32_t vec_conversion<uint32_t, float2>(const float2& a)
 {
    union {
        half2 as_half2;
        uint32_t as_bits;
    };
    as_half2 = __float22half2_rn(a);
    return as_bits;
 }
 // Float4 -> half2x2
 // Each float2 half of `a` is staged in a temporary and packed separately.
 template <>
 __inline__ __device__ uint2 vec_conversion<uint2, Float4_>(const Float4_& a)
 {
    uint2 packed;
    float2 staged;
    staged.x = a.x.x;
    staged.y = a.x.y;
    packed.x = vec_conversion<uint32_t, float2>(staged);
    staged.x = a.y.x;
    staged.y = a.y.y;
    packed.y = vec_conversion<uint32_t, float2>(staged);
    return packed;
 }
 // Float4 -> float4
 // Flattens the two float2 members into x, y, z, w.
 template <>
 __inline__ __device__ float4 vec_conversion<float4, Float4_>(const Float4_& a)
 {
    float4 flat;
    flat.x = a.x.x;
    flat.y = a.x.y;
    flat.z = a.y.x;
    flat.w = a.y.y;
    return flat;
 }
 // Float8 -> half2x4
 // One packed half2 word per float2 member.
 template <>
 __inline__ __device__ uint4 vec_conversion<uint4, Float8_>(const Float8_& a)
 {
    uint4 packed;
    packed.x = vec_conversion<uint32_t, float2>(a.x);
    packed.y = vec_conversion<uint32_t, float2>(a.y);
    packed.z = vec_conversion<uint32_t, float2>(a.z);
    packed.w = vec_conversion<uint32_t, float2>(a.w);
    return packed;
 }
 // float2 -> bfloat162
 template <>
 __inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2& a)
 {
    __nv_bfloat162 pair = __float22bfloat162_rn(a);
    return pair;
 }
 // Float4 -> bfloat162x2
 template <>
 __inline__ __device__ bf16_4_t vec_conversion<bf16_4_t, Float4_>(const Float4_& a)
 {
    bf16_4_t quad;
    quad.x = __float22bfloat162_rn(a.x);
    quad.y = __float22bfloat162_rn(a.y);
    return quad;
 }
 // Float8 -> bfloat162x4
 template <>
 __inline__ __device__ bf16_8_t vec_conversion<bf16_8_t, Float8_>(const Float8_& a)
 {
    bf16_8_t octet;
    octet.x = __float22bfloat162_rn(a.x);
    octet.y = __float22bfloat162_rn(a.y);
    octet.z = __float22bfloat162_rn(a.z);
    octet.w = __float22bfloat162_rn(a.w);
    return octet;
 }
 /* Scaled, vectorized conversions for moving data between the high-precision
   (HP) and FP8 domains. Scale convention used throughout this API:
     Quantize(HP / scale) => FP8
     Dequant(FP8) * scale => HP
 */
 // fp8 -> half
 // Decodes the byte, multiplies by `scale` in float, returns the half bits.
 template <>
 __inline__ __device__ uint16_t scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, const float scale)
 {
    hip_fp8 decoded{a, hip_fp8::from_bits()};
    __half_raw half_bits;
    half_bits.data = static_cast<float>(decoded) * scale;
    return half_bits.x;
 }
 // fp8x2 -> half2
 template <>
 __inline__ __device__ uint32_t scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, const float scale)
 {
 #if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
    // Bulk path: one packed intrinsic yields both floats, each then scaled.
    const auto& pair_f32 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
    union {
        __half2_raw h2r;
        uint32_t ui32;
    } packed;
    packed.h2r.x.data = pair_f32[0] * scale;
    packed.h2r.y.data = pair_f32[1] * scale;
    return packed.ui32;
 #else
    // Fallback: convert the low byte and the high byte one at a time.
    const uint8_t lo_byte = static_cast<uint8_t>(a);
    const uint8_t hi_byte = static_cast<uint8_t>(a >> 8U);
    union {
        uint16_t u16[2];
        uint32_t u32;
    } packed;
    packed.u16[0] = scaled_vec_conversion<uint16_t, uint8_t>(lo_byte, scale);
    packed.u16[1] = scaled_vec_conversion<uint16_t, uint8_t>(hi_byte, scale);
    return packed.u32;
 #endif
 }
 // fp8x4 -> half2x2
 // Low 16 bits of `a` fill .x, high 16 bits fill .y.
 template <>
 __inline__ __device__ uint2 scaled_vec_conversion<uint2, uint32_t>(const uint32_t& a, const float scale)
 {
    union {
        uint2 u32x2;
        uint32_t u32[2];
    } halves;
    halves.u32[0] = scaled_vec_conversion<uint32_t, uint16_t>(static_cast<uint16_t>(a), scale);
    halves.u32[1] = scaled_vec_conversion<uint32_t, uint16_t>(static_cast<uint16_t>(a >> 16U), scale);
    return halves.u32x2;
 }
 // fp8x8 -> half2x4
 // a.x produces the first uint2 of the result, a.y the second.
 template <>
 __inline__ __device__ uint4 scaled_vec_conversion<uint4, uint2>(const uint2& a, const float scale)
 {
    union {
        uint4 u64x2;
        uint2 u64[2];
    } quads;
    quads.u64[0] = scaled_vec_conversion<uint2, uint32_t>(a.x, scale);
    quads.u64[1] = scaled_vec_conversion<uint2, uint32_t>(a.y, scale);
    return quads.u64x2;
 }
 using __nv_bfloat16 = __hip_bfloat16;
 // fp8 -> __nv_bfloat16
 // Decodes to float, multiplies by `scale`, then converts to bfloat16.
 template <>
 __inline__ __device__ __nv_bfloat16 scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a, const float scale)
 {
    hip_fp8 decoded{a, hip_fp8::from_bits()};
    const float widened = static_cast<float>(decoded);
    return __float2bfloat16(widened * scale);
 }
 using __nv_bfloat162 = __hip_bfloat162;
 // fp8x2 -> __nv_bfloat162
 // Low byte of `a` fills .x, high byte fills .y.
 template <>
 __inline__ __device__ __nv_bfloat162 scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a, const float scale)
 {
    __nv_bfloat162 pair;
    pair.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>(static_cast<uint8_t>(a), scale);
    pair.y = scaled_vec_conversion<__nv_bfloat16, uint8_t>(static_cast<uint8_t>(a >> 8U), scale);
    return pair;
 }
 // fp8x4 -> bf16_4_t
 // Low 16 bits of `a` fill .x, high 16 bits fill .y.
 template <>
 __inline__ __device__ bf16_4_t scaled_vec_conversion<bf16_4_t, uint32_t>(const uint32_t& a, const float scale)
 {
    bf16_4_t quad;
    quad.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>(static_cast<uint16_t>(a), scale);
    quad.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>(static_cast<uint16_t>(a >> 16U), scale);
    return quad;
 }
 // fp8x8 -> bf16_8_t
 // a.x supplies .x/.y of the result, a.y supplies .z/.w.
 template <>
 __inline__ __device__ bf16_8_t scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, const float scale)
 {
    bf16_4_t first_half, second_half;
    first_half = scaled_vec_conversion<bf16_4_t, uint32_t>(a.x, scale);
    second_half = scaled_vec_conversion<bf16_4_t, uint32_t>(a.y, scale);
    bf16_8_t octet;
    octet.x = first_half.x;
    octet.y = first_half.y;
    octet.z = second_half.x;
    octet.w = second_half.y;
    return octet;
 }
 // fp8 -> float
 // Decodes the byte and multiplies the float value by `scale`.
 template <>
 __inline__ __device__ float scaled_vec_conversion<float, uint8_t>(const uint8_t& a, const float scale)
 {
    hip_fp8 decoded{a, hip_fp8::from_bits()};
    const float widened = static_cast<float>(decoded);
    return widened * scale;
 }
 // fp8x2 -> float2
 template <>
 __inline__ __device__ float2 scaled_vec_conversion<float2, uint16_t>(const uint16_t& a, const float scale)
 {
    float2 pair;
 #if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
    // Bulk path: one packed intrinsic yields both floats, each then scaled.
    const auto& pair_f32 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
    pair.x = pair_f32[0] * scale;
    pair.y = pair_f32[1] * scale;
 #else
    // Fallback: low byte -> .x, high byte -> .y.
    pair.x = scaled_vec_conversion<float, uint8_t>(static_cast<uint8_t>(a), scale);
    pair.y = scaled_vec_conversion<float, uint8_t>(static_cast<uint8_t>(a >> 8U), scale);
 #endif
    return pair;
 }
 // fp8x4 -> float4
 // Low 16 bits of `a` fill .x, high 16 bits fill .y.
 template <>
 __inline__ __device__ Float4_ scaled_vec_conversion<Float4_, uint32_t>(const uint32_t& a, const float scale)
 {
    Float4_ quad;
    quad.x = scaled_vec_conversion<float2, uint16_t>(static_cast<uint16_t>(a), scale);
    quad.y = scaled_vec_conversion<float2, uint16_t>(static_cast<uint16_t>(a >> 16U), scale);
    return quad;
 }
 // fp8x8 -> float8
 // a.x supplies .x/.y of the result, a.y supplies .z/.w.
 template <>
 __inline__ __device__ Float8_ scaled_vec_conversion<Float8_, uint2>(const uint2& a, const float scale)
 {
    Float4_ first_half, second_half;
    first_half = scaled_vec_conversion<Float4_, uint32_t>(a.x, scale);
    second_half = scaled_vec_conversion<Float4_, uint32_t>(a.y, scale);
    Float8_ octet;
    octet.x = first_half.x;
    octet.y = first_half.y;
    octet.z = second_half.x;
    octet.w = second_half.y;
    return octet;
 }
 /* Quantize(HP / scale) => FP8 */
 // TODO(Hai): vectorized to add
 // half -> fp8
 // `a` carries half bits; the value is widened, divided by `scale`, encoded.
 template <>
 __inline__ __device__ uint8_t scaled_vec_conversion<uint8_t, uint16_t>(const uint16_t& a, const float scale)
 {
    __half_raw half_bits;
    half_bits.x = a;
    hip_fp8 encoded{static_cast<float>(half_bits.data)/scale};
    return encoded.data;
 }
 // bf16 -> fp8
 // Widened to float, divided by `scale`, then encoded.
 template <>
 __inline__ __device__ uint8_t scaled_vec_conversion<uint8_t, __nv_bfloat16>(const __nv_bfloat16& a, const float scale)
 {
    hip_fp8 encoded{__bfloat162float(a)/scale};
    return encoded.data;
 }
 // float -> fp8
 // Divided by `scale`, then encoded; returns the raw byte.
 template <>
 __inline__ __device__ uint8_t scaled_vec_conversion<uint8_t, float>(const float& a, const float scale)
 {
    hip_fp8 encoded(a/scale);
    return encoded.data;
 }
 // fp8x4 -> float4
 // Reuses the scaled Float4_ conversion and flattens its two float2 halves.
 template <>
 __inline__ __device__ float4 scaled_vec_conversion<float4, uint32_t>(const uint32_t& a, const float scale)
 {
    Float4_ pairs = scaled_vec_conversion<Float4_, uint32_t>(a, scale);
    return make_float4(pairs.x.x, pairs.x.y, pairs.y.x, pairs.y.y);
 }
 }
 } // namespace vllm
--- a/csrc/quantization/fp8/fp8_cuda_kernels.cu
+++ b/csrc/quantization/fp8/fp8_cuda_kernels.cu
@@ -0,0 +1,126 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <cmath>
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
 namespace vllm {
 // Atomic max on a float location, built from integer atomics.
 // Non-negative `value` uses a signed-int atomicMax on the float's bits;
 // negative `value` uses an unsigned atomicMin on the float's bits.
 // Returns the previous contents of *addr reinterpreted as float.
 __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    if (value >= 0) {
        return __int_as_float(atomicMax((int*)addr, __float_as_int(value)));
    }
    return __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
 }
 // Compute the absolute maximum m of the input tensor and store
 // m / float8_e4m3::max() in *scale. Each thread block performs a
 // reduction tree and the memory in scale is atomically updated.
 // So to get the right answer, *scale needs to be initialized to
 // a value <= 0.0 and we need to wait for all thread blocks to
 // finish before consuming *scale.
 //
 // Parameters:
 //   scale     - single float that receives the result (see above).
 //   input     - elements to scan; each is converted to float before use.
 //   num_elems - number of elements in `input`.
 // The tree reduction halves the active range each step, so it relies on
 // blockDim.x being a power of two no larger than the 1024-entry cache.
 template<typename scalar_t>
 __global__ void segmented_max_reduction(
  float* __restrict__ scale,
  const scalar_t* __restrict__ input,
  int64_t num_elems) {
  __shared__ float cache[1024];
  // Index and stride are 64-bit so tensors with more than INT_MAX elements
  // neither overflow the counter nor wrap the stride computation.
  int64_t i = static_cast<int64_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  // First store maximum for all values processed by
  // the current thread in cache[threadIdx.x]
  // The running maximum is kept in float, matching the cache element type.
  float tmp = 0.0f;
  while (i < num_elems) {
    float x = static_cast<float>(input[i]);
    tmp = max(tmp, fabs(x));
    i += stride;
  }
  cache[threadIdx.x] = tmp;
  __syncthreads();
  // Now perform parallel reduction within the thread block
  int ib = blockDim.x / 2;
  while (ib != 0) {
    if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) {
        cache[threadIdx.x] = cache[threadIdx.x + ib];
    }
    __syncthreads();
    ib /= 2;
  }
  // Finally, since cache[0] contains the maximum for this thread block,
  // atomically write the max to the target location
  if (threadIdx.x == 0) {
    atomicMaxFloat(scale, cache[0] / std::numeric_limits<c10::Float8_e4m3fn>::max());
  }
 }
 // Element-wise quantization: out[i] = Float8_e4m3fn(input[i] / *scale).
 // Each thread starts at its global index and advances by the total number
 // of launched threads until num_elems is reached.
 //
 // Parameters:
 //   out       - destination buffer, one fp8 value per input element.
 //   input     - source elements.
 //   scale     - pointer to the single divisor applied to every element.
 //   num_elems - number of elements to convert.
 template<typename scalar_t>
 __global__ void scaled_fp8_quant_kernel(
  c10::Float8_e4m3fn* __restrict__ out,
  const scalar_t* __restrict__ input,
  const float* __restrict__ scale,
  int64_t num_elems) {
  // Index and stride are 64-bit so tensors with more than INT_MAX elements
  // neither overflow the counter nor wrap the stride computation.
  int64_t i = static_cast<int64_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  // The divisor does not change during the kernel; read it once.
  const float divisor = *scale;
  while (i < num_elems) {
    out[i] = static_cast<c10::Float8_e4m3fn>(input[i] / divisor);
    i += stride;
  }
 }
 } // namespace vllm
 // Quantizes `input` into `out` using the scale already stored in `scale`.
 // Launches one block of 1024 threads per row, where a row is one slice along
 // the last dimension of `input`, on the current CUDA stream of input's device.
 void static_scaled_fp8_quant(
  torch::Tensor& out,      // [..., d]
  torch::Tensor& input,    // [..., d]
  torch::Tensor& scale)    // [1]
 {
  int64_t num_elems = input.numel();
  int64_t num_tokens = num_elems / input.size(-1);
  dim3 blocks(num_tokens);
  dim3 threads(1024);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // Dispatch on the input dtype and launch the element-wise kernel.
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
    "scaled_fp8_quant_kernel",
    [&] {
      vllm::scaled_fp8_quant_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
        out.data_ptr<c10::Float8_e4m3fn>(),
        input.data_ptr<scalar_t>(),
        scale.data_ptr<float>(),
        num_elems);
      });
 }
 // Computes the scale from `input` and then quantizes `input` into `out`.
 // Two kernels are queued back to back on the same stream: the max reduction
 // that writes `scale`, followed by the element-wise quantization that reads
 // it. Both use one block of 1024 threads per row of `input`.
 void dynamic_scaled_fp8_quant(
  torch::Tensor& out,      // [..., d]
  torch::Tensor& input,    // [..., d]
  torch::Tensor& scale)    // [1]
 {
  int64_t num_elems = input.numel();
  int64_t num_tokens = num_elems / input.size(-1);
  dim3 blocks(num_tokens);
  dim3 threads(1024);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
    "scaled_fp8_quant_kernel",
    [&] {
      // Pass 1: reduce |input| to a single scale value.
      vllm::segmented_max_reduction<scalar_t><<<blocks, threads, 0, stream>>>(
        scale.data_ptr<float>(),
        input.data_ptr<scalar_t>(),
        num_elems);
      // Pass 2: quantize using the scale produced by pass 1.
      vllm::scaled_fp8_quant_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
        out.data_ptr<c10::Float8_e4m3fn>(),
        input.data_ptr<scalar_t>(),
        scale.data_ptr<float>(),
        num_elems);
      });
 }
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -2067,7 +2067,7 @@ void gptq_shuffle
    const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
    vllm::gptq::shuffle_exllama_weight(
        (uint32_t*) q_weight.data_ptr(),
-        q_perm.device().is_meta() ? NULL : (int*) q_perm.data_ptr(),
+        q_perm.device().is_meta() || q_perm.numel() == 0 ? NULL : (int*) q_perm.data_ptr(),
        q_weight.size(0) * 32 / bit,
        q_weight.size(1),
        bit
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
--- a/Show More
+++ b/Show More