[Doc] add common case for long waiting time (#5430 )

[Core][Doc] Default to multiprocessing for single-node distributed case (#5230 )
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-06-11 11:12:13 -07:00 · 2024-06-11 11:10:41 -07:00 · 2024-06-11 10:53:59 -07:00 · 2024-06-11 10:42:26 -07:00 · 2024-06-11 10:40:23 -07:00 · 2024-06-11 10:38:07 -07:00
772 changed files with 132175 additions and 10537 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -0,0 +1,36 @@
+import os
+import zipfile
+
+MAX_SIZE_MB = 200
+
+
+def print_top_10_largest_files(zip_file):  # Print the 10 biggest archive members.
+    with zipfile.ZipFile(zip_file, 'r') as z:
+        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]  # (name, uncompressed bytes)
+        file_sizes.sort(key=lambda x: x[1], reverse=True)  # largest first
+        for f, size in file_sizes[:10]:
+            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
+
+
+def check_wheel_size(directory):
+    for root, _, files in os.walk(directory):
+        for f in files:
+            if f.endswith(".whl"):
+                wheel_path = os.path.join(root, f)
+                wheel_size = os.path.getsize(wheel_path)
+                wheel_size_mb = wheel_size / (1024 * 1024)
+                if wheel_size_mb > MAX_SIZE_MB:
+                    print(
+                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
+                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
+                    print_top_10_largest_files(wheel_path)
+                    return 1
+                else:
+                    print(f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb} MB).")
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(check_wheel_size(sys.argv[1]))
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+
+# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
+mkdir -p images
+cd images
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
+
+cd -
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Install system packages
+apt update
+apt install -y curl jq
+
+# Install minijinja for templating
+curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
+source "$HOME/.cargo/env"
+
+# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
+if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
+  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+
+  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
+    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
+  else
+    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
+    exit 0
+  fi
+fi
+
+# Upload sample.yaml
+buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
--- a/.buildkite/nightly-benchmarks/sample.yaml
+++ b/.buildkite/nightly-benchmarks/sample.yaml
@@ -0,0 +1,39 @@
+steps:
+  # NOTE(simon): You can create separate blocks for different jobs
+  - label: "A100: NVIDIA SMI"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
+          # TODO(simon): check latest main branch or use the PR image.
+          - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
+            command:
+            - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  # TODO(simon): bring H100 online
+  # - label: "H100: NVIDIA SMI"
+  #   agents:
+  #     queue: H100
+  #   plugins:
+  #   - docker#v5.11.0:
+  #       image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
+  #       command:
+  #       - bash -c 'nvidia-smi && nvidia-smi topo -m'
+  #       propagate-environment: true
+  #       ipc: host
+  #       gpus: all
+
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -0,0 +1,73 @@
+# This script runs tests inside the corresponding ROCm docker container.
+set -ex
+
+# Print ROCm version
+echo "--- ROCm info"
+rocminfo
+
+# cleanup older docker images
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes
+    docker volume prune -f
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+
+# Call the cleanup docker function
+cleanup_docker
+
+echo "--- Resetting GPUs"
+
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- Building container"
+sha=$(git rev-parse --short HEAD)
+image_name=rocm_${sha}
+container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
+docker build \
+        -t ${image_name} \
+        -f Dockerfile.rocm \
+        --progress plain \
+        .
+
+remove_docker_container() {
+   docker rm -f ${container_name} || docker image rm -f ${image_name} || true
+}
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+
+docker run \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
+        --rm \
+        -e HF_TOKEN \
+        --name ${container_name} \
+        ${image_name} \
+        /bin/bash -c "${@}"
+
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -0,0 +1,78 @@
+# This script is run by buildkite to run the benchmarks and upload the results to buildkite
+
+set -ex
+set -o pipefail
+
+# cd into parent directory of this file
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+
+# run python-based benchmarks and upload the result to buildkite
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+bench_latency_exit_code=$?
+
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+bench_throughput_exit_code=$?
+
+# run server-based benchmarks and upload the result to buildkite
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+server_pid=$!
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+# wait for server to start, timeout after 600 seconds
+timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+python3 benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --dataset-name sharegpt \
+    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer meta-llama/Llama-2-7b-chat-hf \
+    --save-result \
+    2>&1 | tee benchmark_serving.txt
+bench_serving_exit_code=$?
+kill $server_pid
+
+# write the results into a markdown file
+echo "### Latency Benchmarks" >> benchmark_results.md
+sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line
+echo "" >> benchmark_results.md
+sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line
+
+echo "### Throughput Benchmarks" >> benchmark_results.md
+sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line
+echo "" >> benchmark_results.md
+sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
+
+echo "### Serving Benchmarks" >> benchmark_results.md
+sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
+echo "" >> benchmark_results.md
+echo '```' >> benchmark_results.md
+tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
+echo '```' >> benchmark_results.md
+
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /usr/bin/buildkite-agent ]; then
+    exit 0
+fi
+
+# upload the results to buildkite
+buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+
+# exit with the exit code of the benchmarks
+if [ $bench_latency_exit_code -ne 0 ]; then
+    exit $bench_latency_exit_code
+fi
+
+if [ $bench_throughput_exit_code -ne 0 ]; then
+    exit $bench_throughput_exit_code
+fi
+
+if [ $bench_serving_exit_code -ne 0 ]; then
+    exit $bench_serving_exit_code
+fi
+
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+buildkite-agent artifact upload "*.json"
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -0,0 +1,24 @@
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.cpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+
+# offline inference
+docker exec cpu-test bash -c "python3 examples/offline_inference.py"
+
+# Run basic model test
+docker exec cpu-test bash -c "cd tests;
+  pip install pytest Pillow protobuf
+  bash ../.buildkite/download-images.sh
+  cd ../
+  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -0,0 +1,51 @@
+# This script builds the Neuron docker image and runs the API server inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -e
+
+# Try building the docker image
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+
+# prune old image and containers to save disk space, and only once a day
+# by using a timestamp file in tmp.
+if [ -f /tmp/neuron-docker-build-timestamp ]; then
+    last_build=$(cat /tmp/neuron-docker-build-timestamp)
+    current_time=$(date +%s)
+    if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker system prune -f
+        echo $current_time > /tmp/neuron-docker-build-timestamp
+    fi
+else
+    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
+fi
+
+docker build -t neuron -f Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f neuron || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+
+# Wait for the server to start
+wait_for_server_to_start() {  # Poll /health once per second until it answers 200.
+    timeout=300
+    counter=0
+
+    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
+        sleep 1
+        counter=$((counter + 1))
+        if [ $counter -ge $timeout ]; then
+            echo "Timeout after $timeout seconds"
+            return 1  # under `set -e` this aborts the script instead of testing a dead server
+        fi
+    done
+}
+wait_for_server_to_start
+
+# Test a simple prompt
+curl -X POST -H "Content-Type: application/json" \
+    localhost:8000/generate \
+    -d '{"prompt": "San Francisco is a"}'
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -0,0 +1,169 @@
+# In this file, you can add more tests to run either by adding a new step or
+# adding a new command to an existing step. See different options here for examples.
+# This file will be fed into the Jinja template in `test-template.j2` to generate
+# the final pipeline yaml file.
+
+steps:
+- label: Regression Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: AsyncEngine Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s async_engine
+
+- label: Basic Correctness Test
+  mirror_hardwares: [amd]
+  commands:
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+- label: Core Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s core
+
+- label: Distributed Comm Ops Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s distributed/test_comm_ops.py
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+
+- label: Distributed Tests
+  mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  commands:
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+
+- label: Distributed Tests (Multiple Groups)
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  commands:
+  - pytest -v -s distributed/test_pynccl.py
+
+- label: Engine Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+
+- label: Entrypoints Test
+  mirror_hardwares: [amd]
+
+  commands:
+  - pytest -v -s entrypoints -m llm
+  - pytest -v -s entrypoints -m openai
+
+- label: Examples Test
+  working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
+  commands:
+    # install aws cli for llava_example.py
+    # install tensorizer for tensorize_vllm_model.py
+    - pip install awscli tensorizer
+    - python3 offline_inference.py
+    - python3 offline_inference_with_prefix.py
+    - python3 llm_engine_example.py
+    - python3 llava_example.py
+    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+
+- label: Inputs Test
+  #mirror_hardwares: [amd]
+  commands:
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s test_inputs.py
+    - pytest -v -s multimodal
+
+- label: Kernels Test %N
+  #mirror_hardwares: [amd]
+  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4
+
+- label: Models Test
+  #mirror_hardwares: [amd]
+  commands:
+    - pytest -v -s models -m \"not llava\"
+
+- label: Llava Test
+  mirror_hardwares: [amd]
+  commands:
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s models -m llava
+
+- label: Prefix Caching Test
+  mirror_hardwares: [amd]
+  commands:
+    - pytest -v -s prefix_caching
+
+- label: Samplers Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s samplers
+
+- label: LogitsProcessor Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s test_logits_processor.py
+
+- label: Utils Test
+  command: pytest -v -s test_utils.py
+
+- label: Worker Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s worker
+
+- label: Speculative decoding tests
+  #mirror_hardwares: [amd]
+  commands:
+    # See https://github.com/vllm-project/vllm/issues/5152
+    - export VLLM_ATTENTION_BACKEND=XFORMERS
+    - pytest -v -s spec_decode
+
+- label: LoRA Test %N
+  #mirror_hardwares: [amd]
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+  parallelism: 4
+
+- label: LoRA Long Context (Distributed)
+  #mirror_hardwares: [amd]
+  num_gpus: 4
+  # This test runs llama 13B, so it is required to run on 4 GPUs.
+  commands:
+    - pytest -v -s -x lora/test_long_context.py
+
+- label: Tensorizer Test
+  #mirror_hardwares: [amd]
+  command: apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader
+
+- label: Metrics Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s metrics
+
+- label: Quantization Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s quantization
+
+- label: Benchmarks
+  working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
+  commands:
+  - pip install aiohttp
+  - bash run-benchmarks.sh
+
+- label: Documentation Build
+  working_dir: "/vllm-workspace/test_docs/docs"
+  no_gpu: True
+  commands:
+  - pip install -r requirements-docs.txt
+  - SPHINXOPTS=\"-W\" make html
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -0,0 +1,64 @@
+{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+
+steps:
+  - label: ":docker: build image"
+    agents:
+      queue: cpu_queue
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker push {{ docker_image }}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+  - wait
+
+  {% for step in steps %}
+  - label: "{{ step.label }}"
+    agents:
+      {% if step.label == "Documentation Build" %}
+      queue: small_cpu_queue
+      {% elif step.no_gpu %}
+      queue: cpu_queue
+      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
+      queue: gpu_4_queue
+      {% else %}
+      queue: gpu_1_queue
+      {% endif %}
+    soft_fail: true
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+    plugins:
+      - docker#v5.2.0:
+          image: {{ docker_image }}
+          always-pull: true
+          propagate-environment: true
+          {% if not step.no_gpu %}
+          gpus: all
+          {% endif %}
+          {% if step.label == "Benchmarks" %}
+          mount-buildkite-agent: true
+          {% endif %}
+          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"]
+          environment:
+            - VLLM_USAGE_SOURCE=ci-test
+            - HF_TOKEN
+            {% if step.label == "Speculative decoding tests" %}
+            - VLLM_ATTENTION_BACKEND=XFORMERS
+            {% endif %}
+          volumes:
+            - /dev/shm:/dev/shm
+  {% endfor %}
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -0,0 +1,96 @@
+{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
+{% set default_num_gpu = 1 %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+
+steps:
+  - label: ":docker: build image"
+    commands:
+      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker push {{ docker_image }}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+  - wait
+
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+      - label: "AMD: {{ step.label }}"
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
+        env:
+          DOCKER_BUILDKIT: "1"
+        soft_fail: true
+    {% endif %}
+    {% endfor %}
+
+  - label: "Neuron Test"
+    depends_on: ~
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
+    soft_fail: false
+
+  - label: "Intel Test"
+    depends_on: ~
+    agents:
+      queue: intel
+    command: bash .buildkite/run-cpu-test.sh
+
+  {% for step in steps %}
+  - label: "{{ step.label }}"
+    agents:
+      queue: kubernetes
+    soft_fail: {{ step.soft_fail or false }}
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+    plugins:
+      - kubernetes:
+          podSpec:
+            {% if step.num_gpus %}
+            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
+            {% endif %}
+            volumes:
+              - name: dshm
+                emptyDir:
+                  medium: Memory
+            containers:
+              - image: "{{ docker_image }}"
+                command: ["bash"]
+                args:
+                - '-c'
+                - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
+                {% if not step.no_gpu %}
+                resources:
+                  requests:
+                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
+                  limits:
+                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
+                {% endif %}
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                volumeMounts:
+                  - mountPath: /dev/shm
+                    name: dshm
+  {% endfor %}
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,26 @@
+BasedOnStyle: Google
+UseTab: Never
+IndentWidth: 2
+ColumnLimit: 80
+
+# Force pointers to the type for C++.
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# Reordering #include statements can (and currently will) introduce errors
+SortIncludes: false
+
+# Style choices
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+IndentPPDirectives: BeforeHash
+
+IncludeCategories:
+  - Regex:           '^<'
+    Priority:        4
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
+    Priority:        3
+  - Regex:           '^"(qoda|\.\.)/'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        1
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1 @@
+vllm/*.so
--- a/.github/ISSUE_TEMPLATE/100-documentation.yml
+++ b/.github/ISSUE_TEMPLATE/100-documentation.yml
@@ -0,0 +1,22 @@
+name: 📚 Documentation
+description: Report an issue related to https://docs.vllm.ai/
+title: "[Doc]: "
+labels: ["documentation"]
+
+body:
+- type: textarea
+  attributes:
+    label: 📚 The doc issue
+    description: >
+      A clear and concise description of what content in https://docs.vllm.ai/ is an issue.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Suggest a potential alternative/fix
+    description: >
+      Tell us how we could improve the documentation in this regard.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -0,0 +1,40 @@
+name: 🛠️ Installation
+description: Report an issue here when you hit errors during installation.
+title: "[Installation]: "
+labels: ["installation"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Your current environment
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      # For security purposes, please feel free to check the contents of collect_env.py before running it.
+      python collect_env.py
+      ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
+    value: |
+      ```text
+      The output of `python collect_env.py`
+      ```
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: How you are installing vllm
+    description: |
+      Paste the full command you are trying to execute.
+    value: |
+      ```sh
+      pip install -vvv vllm
+      ```
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -0,0 +1,38 @@
+name: 💻 Usage
+description: Raise an issue here if you don't know how to use vllm.
+title: "[Usage]: "
+labels: ["usage"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Your current environment
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      # For security purposes, please feel free to check the contents of collect_env.py before running it.
+      python collect_env.py
+      ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
+    value: |
+      ```text
+      The output of `python collect_env.py`
+      ```
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: How would you like to use vllm
+    description: |
+      A detailed description of how you want to use vllm.
+    value: |
+      I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/400-bug
+++ b/.github/ISSUE_TEMPLATE/400-bug
@@ -0,0 +1,86 @@
+name: 🐛 Bug report
+description: Raise an issue here if you find a bug.
+title: "[Bug]: "
+labels: ["bug"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Your current environment
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      # For security purposes, please feel free to check the contents of collect_env.py before running it.
+      python collect_env.py
+      ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
+    value: |
+      ```text
+      The output of `python collect_env.py`
+      ```
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: 🐛 Describe the bug
+    description: |
+      Please provide a clear and concise description of what the bug is.
+
+      If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
+
+      ```python
+      from vllm import LLM, SamplingParams
+
+      prompts = [
+          "Hello, my name is",
+          "The president of the United States is",
+          "The capital of France is",
+          "The future of AI is",
+      ]
+      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+      llm = LLM(model="facebook/opt-125m")
+
+      outputs = llm.generate(prompts, sampling_params)
+
+      # Print the outputs.
+      for output in outputs:
+          prompt = output.prompt
+          generated_text = output.outputs[0].text
+          print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+      ```
+
+      If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
+
+      Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+
+      Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
+
+      If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
+    placeholder: |
+      A clear and concise description of what the bug is.
+
+      ```python
+      # Sample code to reproduce the problem
+      ```
+
+      ```
+      The error message you got, with the full traceback.
+      ```
+  validations:
+    required: true
+- type: markdown
+  attributes:
+    value: >
+      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
+
+      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
+
+      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
+
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/500-feature
+++ b/.github/ISSUE_TEMPLATE/500-feature
@@ -0,0 +1,31 @@
+name: 🚀 Feature request
+description: Submit a proposal/request for a new vllm feature
+title: "[Feature]: "
+labels: ["feature request"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: 🚀 The feature, motivation and pitch
+    description: >
+      A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Alternatives
+    description: >
+      A description of any alternative solutions or features you've considered, if any.
+- type: textarea
+  attributes:
+    label: Additional context
+    description: >
+      Add any other context or screenshots about the feature request.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/600-new
+++ b/.github/ISSUE_TEMPLATE/600-new
@@ -0,0 +1,33 @@
+name: 🤗 Support request for a new model from huggingface
+description: Submit a proposal/request for a new model from huggingface
+title: "[New Model]: "
+labels: ["new model"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+- type: textarea
+  attributes:
+    label: The model to consider.
+    description: >
+      A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 .
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: The closest model vllm already supports.
+    description: >
+      Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for?
+- type: textarea
+  attributes:
+    label: What's your difficulty of supporting the model you want?
+    description: >
+      For example, any new operators or new architecture?
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/700-performance
+++ b/.github/ISSUE_TEMPLATE/700-performance
@@ -0,0 +1,52 @@
+name: ⚡ Discussion on the performance of vllm
+description: Submit a proposal/discussion about the performance of vllm
+title: "[Performance]: "
+labels: ["performance"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Proposal to improve performance
+    description: >
+      How do you plan to improve vllm's performance?
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Report of performance regression
+    description: >
+      Please provide a detailed description of the performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks .
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Misc discussion on performance
+    description: >
+      Anything about the performance.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Your current environment (if you think it is necessary)
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      # For security purposes, please feel free to check the contents of collect_env.py before running it.
+      python collect_env.py
+      ```
+      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
+    value: |
+      ```text
+      The output of `python collect_env.py`
+      ```
+  validations:
+    required: false
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -0,0 +1,49 @@
+name: 💬 Request for comments (RFC).
+description: Ask for feedback on major architectural changes or design choices.
+title: "[RFC]: "
+labels: ["RFC"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
+- type: textarea
+  attributes:
+    label: Motivation.
+    description: >
+      The motivation of the RFC.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Proposed Change.
+    description: >
+      The proposed change of the RFC.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Feedback Period.
+    description: >
+      The feedback period of the RFC. Usually at least one week.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: CC List.
+    description: >
+      The list of people you want to CC.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Any Other Things.
+    description: >
+      Any other things you would like to mention.
+  validations:
+    required: false
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/800-misc
+++ b/.github/ISSUE_TEMPLATE/800-misc
@@ -0,0 +1,21 @@
+name: 🎲 Misc/random discussions that do not fit into the above categories.
+description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
+title: "[Misc]: "
+labels: ["misc"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Anything you want to discuss about vllm.
+    description: >
+      Anything you want to discuss about vllm.
+  validations:
+    required: true
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: false
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,64 @@
+FILL IN THE PR DESCRIPTION HERE
+
+FIX #xxxx (*link existing issues this PR will resolve*)
+
+**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
+
+---
+
+<details>
+<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
+<summary><b> PR Checklist (Click to Expand) </b></summary>
+
+<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
+
+<h3>PR Title and Classification</h3>
+<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
+<ul>
+    <li><code>[Bugfix]</code> for bug fixes.</li>
+    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
+    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
+    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
+    <li><code>[Frontend]</code> for changes to the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.).</li>
+    <li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
+    <li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
+    <li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
+    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
+</ul>
+<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
+
+<h3>Code Quality</h3>
+
+<p>The PR needs to meet the following code quality standards:</p>
+
+<ul>
+    <li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
+    <li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
+    <li>The code needs to be well-documented to ensure future contributors can easily understand the code.</li>
+    <li>Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.</li>
+    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
+</ul>
+
+<h3>Notes for Large Changes</h3>
+<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not review the PR.</p>
+
+<h3>What to Expect for the Reviews</h3>
+
+<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
+
+<ul>
+    <li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
+    <li> After the PR is assigned, the reviewer will provide a status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
+    <li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
+    <li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
+ </li>
+</ul>
+
+<h3>Thank You</h3>
+
+<p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
+
+
+</details>
+
+
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -0,0 +1,42 @@
+name: clang-format
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  clang-format:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install clang-format==18.1.5
+    - name: Running clang-format
+      run: |
+        EXCLUDES=(
+            'csrc/moe/topk_softmax_kernels.cu'
+            'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
+            'csrc/punica/bgmv/bgmv_config.h'
+            'csrc/punica/bgmv/bgmv_impl.cuh'
+            'csrc/punica/bgmv/vec_dtypes.cuh'
+            'csrc/punica/punica_ops.cu'
+            'csrc/punica/type_convert.h'
+        )
+        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
+            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
+            | xargs clang-format --dry-run --Werror
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -0,0 +1,51 @@
+name: mypy
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  # NOTE(review): the job id is `ruff` although this workflow runs mypy (looks copied from ruff.yml).
+  # Renaming it changes the reported status-check name, so confirm branch-protection rules first.
+  ruff:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install mypy==1.9.0
+        pip install types-setuptools
+        pip install types-PyYAML
+        pip install types-requests
+    - name: Mypy
+      run: |
+        mypy vllm/attention --config-file pyproject.toml
+        mypy vllm/core --config-file pyproject.toml
+        mypy vllm/distributed --config-file pyproject.toml
+        mypy vllm/entrypoints --config-file pyproject.toml
+        mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/multimodal --config-file pyproject.toml
+        mypy vllm/usage --config-file pyproject.toml
+        mypy vllm/*.py --config-file pyproject.toml
+        mypy vllm/transformers_utils --config-file pyproject.toml
+        mypy vllm/engine  --config-file pyproject.toml
+        mypy vllm/worker --config-file pyproject.toml
+        mypy vllm/spec_decode --config-file pyproject.toml
+        mypy vllm/model_executor  --config-file pyproject.toml
+        mypy vllm/lora --config-file pyproject.toml
+        mypy vllm/logging --config-file pyproject.toml
+
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,110 @@
+# This workflow will upload a Python Package to Release asset
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Create Release
+
+on:
+  push:
+    tags:
+      - v*
+
+# Needed to create release and upload assets
+permissions:
+  contents: write
+
+jobs:
+  release:
+    # Retrieve tag and create release
+    name: Create Release
+    runs-on: ubuntu-latest
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Extract branch info
+        shell: bash
+        run: |
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+
+      - name: Create Release
+        id: create_release
+        uses: "actions/github-script@v6"
+        env:
+          RELEASE_TAG: ${{ env.release_tag }}
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          script: |
+            const script = require('.github/workflows/scripts/create_release.js')
+            await script(github, context, core)
+
+  wheel:
+    name: Build Wheel
+    runs-on: ${{ matrix.os }}
+    needs: release
+
+    strategy:
+      fail-fast: false
+      matrix:
+          os: ['ubuntu-20.04']
+          python-version: ['3.8', '3.9', '3.10', '3.11']
+          pytorch-version: ['2.3.0']  # Must be the most recent version that meets requirements-cuda.txt.
+          cuda-version: ['11.8', '12.1']
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          create-symlink: true
+          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
+      - name: Set up Linux Env
+        if: ${{ runner.os == 'Linux' }}
+        run: |
+          bash -x .github/workflows/scripts/env.sh
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+            python-version: ${{ matrix.python-version }}
+
+      - name: Install CUDA ${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+
+      - name: Build wheel
+        shell: bash
+        env:
+          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+        run: |
+          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+          asset_name=${wheel_name//"linux"/"manylinux1"}
+          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          echo "asset_name=${asset_name}" >> $GITHUB_ENV
+
+      - name: Upload Release Asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.release.outputs.upload_url }}
+          asset_path: ./dist/${{ env.wheel_name }}
+          asset_name: ${{ env.asset_name }}
+          asset_content_type: application/*
+
+      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
+      # - name: Publish package
+      #   uses: pypa/gh-action-pypi-publish@release/v1.8
+      #   with:
+      #     repository-url: https://test.pypi.org/legacy/
+      #     password: ${{ secrets.PYPI_API_TOKEN }}
+      #     skip-existing: true
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -0,0 +1,37 @@
+name: ruff
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
+    - name: Analysing the code with ruff
+      run: |
+        ruff .
+    - name: Spelling check with codespell
+      run: |
+        codespell --toml pyproject.toml
+    - name: Run isort
+      run: |
+        isort . --check-only
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+python_executable=python$1
+cuda_home=/usr/local/cuda-$2
+
+# Update paths
+PATH=${cuda_home}/bin:$PATH
+LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
+
+# Install requirements
+$python_executable -m pip install wheel packaging
+$python_executable -m pip install -r requirements-cuda.txt
+
+# Limit the number of parallel jobs to avoid OOM
+export MAX_JOBS=1
+# Make sure punica is built for the release (for LoRA)
+export VLLM_INSTALL_PUNICA_KERNELS=1
+# Make sure release wheels are built for the following architectures
+export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+# Build
+$python_executable setup.py bdist_wheel --dist-dir=dist
--- a/.github/workflows/scripts/create_release.js
+++ b/.github/workflows/scripts/create_release.js
@@ -0,0 +1,20 @@
+// Uses GitHub's API to create the release for RELEASE_TAG and wait for the result.
+// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
+// Called with the `github`, `context` and `core` objects that actions/github-script passes in.
+module.exports = async (github, context, core) => {
+	try {
+		const response = await github.rest.repos.createRelease({
+			draft: false,
+			generate_release_notes: true,
+			name: process.env.RELEASE_TAG, // the release title is the tag itself
+			owner: context.repo.owner,
+			prerelease: true, // every release created here is flagged as a pre-release
+			repo: context.repo.repo,
+			tag_name: process.env.RELEASE_TAG,
+		});
+
+		core.setOutput('upload_url', response.data.upload_url); // step output used later to attach assets
+	} catch (error) {
+		core.setFailed(error.message); // fail the step with the API error message instead of rethrowing
+	}
+}
--- a/.github/workflows/scripts/cuda-install.sh
+++ b/.github/workflows/scripts/cuda-install.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Replace '.' with '-' ex: 11.8 -> 11-8
+cuda_version=$(echo $1 | tr "." "-")
+# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
+OS=$(echo $2 | tr -d ".\-")
+
+# Installs CUDA
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+rm cuda-keyring_1.1-1_all.deb
+sudo apt -qq update
+sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
+sudo apt clean
+
+# Test nvcc
+PATH=/usr/local/cuda-$1/bin:${PATH}
+nvcc --version
+
+# Log gcc, g++, c++ versions
+gcc --version
+g++ --version
+c++ --version
--- a/.github/workflows/scripts/env.sh
+++ b/.github/workflows/scripts/env.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# This file installs common linux environment tools
+
+export LANG=C.UTF-8
+
+# python_version=$1
+
+sudo    apt-get update && \
+sudo    apt-get install -y --no-install-recommends \
+        software-properties-common
+
+sudo    apt-get install -y --no-install-recommends \
+        build-essential \
+        apt-utils \
+        ca-certificates \
+        wget \
+        git \
+        vim \
+        libssl-dev \
+        curl \
+        unzip \
+        unrar \
+        cmake \
+        net-tools \
+        sudo \
+        autotools-dev \
+        rsync \
+        jq \
+        openssh-server \
+        tmux \
+        screen \
+        htop \
+        pdsh \
+        openssh-client \
+        lshw \
+        dmidecode \
+        util-linux \
+        automake \
+        autoconf \
+        libtool \
+        net-tools \
+        pciutils \
+        libpci-dev \
+        libaio-dev \
+        libcap2 \
+        libtinfo5 \
+        fakeroot \
+        devscripts \
+        debhelper \
+        nfs-common
+
+# Remove github bloat files to free up disk space
+sudo rm -rf "/usr/local/share/boost"
+sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+sudo rm -rf "/usr/share/dotnet"
--- a/.github/workflows/scripts/pytorch-install.sh
+++ b/.github/workflows/scripts/pytorch-install.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+python_executable=python$1
+pytorch_version=$2
+cuda_version=$3
+
+# Install torch
+$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
+$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
+
+# Print version information
+$python_executable --version
+$python_executable -c "import torch; print('PyTorch:', torch.__version__)"
+$python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
+$python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -0,0 +1,31 @@
+name: yapf
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+jobs:
+  yapf:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install yapf==0.32.0
+        pip install toml==0.10.2
+    - name: Running yapf
+      run: |
+        yapf --diff --recursive .
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,189 @@
-**/*.pyc
-**/__pycache__/
-*.egg-info/
-*.eggs/
-*.so
-build/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class

+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# VSCode
+.vscode/
+
+# DS Store
+.DS_Store
+
+# Results
+*.csv
+
+# Python pickle files
 *.pkl
-*.png
-**/log.txt
+
+# Sphinx documentation
+_build/
+
+# vim swap files
+*.swo
+*.swp
+
+# hip files generated by PyTorch
+*.hip
+*_hip*
+hip_compat.h
+
+# Benchmark dataset
+*.json
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -0,0 +1,21 @@
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.8"
+
+sphinx:
+   configuration: docs/source/conf.py
+
+# If using Sphinx, optionally build your docs in additional formats such as PDF
+formats:
+   - pdf
+
+# Optionally declare the Python requirements required to build your docs
+python:
+   install:
+   - requirements: docs/requirements-docs.txt
--- a/.yapfignore
+++ b/.yapfignore
@@ -0,0 +1 @@
+collect_env.py
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,315 @@
+cmake_minimum_required(VERSION 3.21)
+
+project(vllm_extensions LANGUAGES CXX)
+
+set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
+
+message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
+
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
+
+#
+# Supported python versions.  These versions will be searched in order, the
+# first match will be selected.  These should be kept in sync with setup.py.
+#
+set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
+
+# Supported NVIDIA architectures.
+set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+
+# Supported AMD GPU architectures.
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
+
+#
+# Supported/expected torch versions for CUDA/ROCm.
+#
+# Currently, having an incorrect pytorch version results in a warning
+# rather than an error.
+#
+# Note: the CUDA torch version is derived from pyproject.toml and various
+# requirements.txt files and should be kept consistent.  The ROCm torch
+# versions are derived from Dockerfile.rocm
+#
+set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
+set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
+set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
+
+#
+# Try to find python package with an executable that exactly matches
+# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
+#
+if (VLLM_PYTHON_EXECUTABLE)
+  find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
+else()
+  message(FATAL_ERROR
+    "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
+    " before running cmake configure.")
+endif()
+
+#
+# Update cmake's `CMAKE_PREFIX_PATH` with torch location.
+#
+append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
+
+# Ensure the 'nvcc' command is in the PATH
+find_program(NVCC_EXECUTABLE nvcc)
+if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
+    message(FATAL_ERROR "nvcc not found")
+endif()
+
+#
+# Import torch cmake configuration.
+# Torch also imports CUDA (and partially HIP) languages with some customizations,
+# so there is no need to do this explicitly with check_language/enable_language,
+# etc.
+#
+find_package(Torch REQUIRED)
+
+#
+# Forward the non-CUDA device extensions to external CMake scripts.
+#
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
+    NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
+    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
+        include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
+    else()
+        message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
+    endif()
+    return()
+endif()
+
+#
+# Set up GPU language and check the torch version and warn if it isn't
+# what is expected.
+#
+if (NOT HIP_FOUND AND CUDA_FOUND)
+  set(VLLM_GPU_LANG "CUDA")
+
+  if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
+    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
+      "expected for CUDA build, saw ${Torch_VERSION} instead.")
+  endif()
+elseif(HIP_FOUND)
+  set(VLLM_GPU_LANG "HIP")
+
+  # Importing torch recognizes and sets up some HIP/ROCm configuration but does
+  # not let cmake recognize .hip files. In order to get cmake to understand the
+  # .hip extension automatically, HIP must be enabled explicitly.
+  enable_language(HIP)
+
+  # ROCm 5.x: warn (do not fail) when torch differs from the expected version.
+  if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
+      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
+    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
+      "expected for ROCm 5.x build, saw ${Torch_VERSION} instead.")
+  endif()
+
+  # ROCm 6.x: warn (do not fail) when torch differs from the expected version.
+  if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
+      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
+    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
+      "expected for ROCm 6.x build, saw ${Torch_VERSION} instead.")
+  endif()
+else()
+  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
+endif()
+
+#
+# Override the GPU architectures detected by cmake/torch and filter them by
+# the supported versions for the current language.
+# The final set of arches is stored in `VLLM_GPU_ARCHES`.
+#
+override_gpu_arches(VLLM_GPU_ARCHES
+  ${VLLM_GPU_LANG}
+  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+
+#
+# Query torch for additional GPU compilation flags for the given
+# `VLLM_GPU_LANG`.
+# The final set of flags is stored in `VLLM_GPU_FLAGS`.
+#
+get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
+
+#
+# Set nvcc parallelism.
+#
+if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
+endif()
+
+#
+# Define extension targets
+#
+
+#
+# _C extension
+#
+
+set(VLLM_EXT_SRC
+  "csrc/cache_kernels.cu"
+  "csrc/attention/attention_kernels.cu"
+  "csrc/pos_encoding_kernels.cu"
+  "csrc/activation_kernels.cu"
+  "csrc/layernorm_kernels.cu"
+  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
+  "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+  "csrc/quantization/fp8/common.cu"
+  "csrc/cuda_utils_kernels.cu"
+  "csrc/moe_align_block_size_kernels.cu"
+  "csrc/torch_bindings.cpp")
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  include(FetchContent)  # CUTLASS is fetched at configure time; only its headers are needed.
+  set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+  FetchContent_Declare(
+        cutlass
+        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+        # CUTLASS 3.5.0
+        GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+  )
+  FetchContent_MakeAvailable(cutlass)
+
+  list(APPEND VLLM_EXT_SRC
+    "csrc/quantization/aqlm/gemm_kernels.cu"
+    "csrc/quantization/awq/gemm_kernels.cu"
+    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+    "csrc/custom_all_reduce.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+
+  #
+  # The CUTLASS kernels for Hopper require sm90a to be enabled.
+  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+    set_source_files_properties(
+          "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+          PROPERTIES
+          COMPILE_FLAGS
+          "-gencode arch=compute_90a,code=sm_90a")
+  endif()
+
+endif()
+
+define_gpu_extension_target(
+  _C
+  DESTINATION vllm
+  LANGUAGE ${VLLM_GPU_LANG}
+  SOURCES ${VLLM_EXT_SRC}
+  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+  ARCHITECTURES ${VLLM_GPU_ARCHES}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  USE_SABI 3
+  WITH_SOABI)
+
+#
+# _moe_C extension
+#
+
+set(VLLM_MOE_EXT_SRC
+  "csrc/moe/torch_bindings.cpp"
+  "csrc/moe/topk_softmax_kernels.cu")
+
+define_gpu_extension_target(
+  _moe_C
+  DESTINATION vllm
+  LANGUAGE ${VLLM_GPU_LANG}
+  SOURCES ${VLLM_MOE_EXT_SRC}
+  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+  ARCHITECTURES ${VLLM_GPU_ARCHES}
+  USE_SABI 3
+  WITH_SOABI)
+
+#
+# _punica_C extension
+#
+
+set(VLLM_PUNICA_EXT_SRC
+  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
+  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
+  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
+  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
+  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
+  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
+  "csrc/punica/punica_ops.cu"
+  "csrc/punica/torch_bindings.cpp")
+
+#
+# Copy GPU compilation flags+update for punica
+#
+set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
+list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
+  "-D__CUDA_NO_HALF_OPERATORS__"
+  "-D__CUDA_NO_HALF_CONVERSIONS__"
+  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
+  "-D__CUDA_NO_HALF2_OPERATORS__")
+
+#
+# Filter out CUDA architectures < 8.0 for punica.
+#
+if (${VLLM_GPU_LANG} STREQUAL "CUDA")
+  set(VLLM_PUNICA_GPU_ARCHES)
+  foreach(ARCH ${VLLM_GPU_ARCHES})
+    string_to_ver(CODE_VER ${ARCH})
+    if (CODE_VER GREATER_EQUAL 8.0)
+      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
+    endif()
+  endforeach()
+  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
+  set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
+  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+endif()
+
+if (VLLM_PUNICA_GPU_ARCHES)
+  define_gpu_extension_target(
+    _punica_C
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES ${VLLM_PUNICA_EXT_SRC}
+    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
+    USE_SABI 3
+    WITH_SOABI)
+else()
+  message(WARNING "Unable to create _punica_C target because none of the "
+    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
+endif()
+
+#
+# Add the `default` target which detects which extensions should be
+# built based on platform/architecture.  This is the same logic that
+# setup.py uses to select which extensions should be built and should
+# be kept in sync.
+#
+# The `default` target makes direct use of cmake easier since knowledge
+# of which extensions are supported has been factored in, e.g.
+#
+# mkdir build && cd build
+# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
+# cmake --build . --target default
+#
+add_custom_target(default)
+
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+  message(STATUS "Enabling C extension.")
+  add_dependencies(default _C)
+
+  message(STATUS "Enabling moe extension.")
+  add_dependencies(default _moe_C)
+
+  # Enable punica if there are supported target arches and either
+  # -DVLLM_INSTALL_PUNICA_KERNELS=ON or the environment variable of that name
+  # is a true constant. Needs quoted "$ENV{}": if(ENV{X}) is always false.
+  if (VLLM_PUNICA_GPU_ARCHES AND
+      ("$ENV{VLLM_INSTALL_PUNICA_KERNELS}" OR VLLM_INSTALL_PUNICA_KERNELS))
+    message(STATUS "Enabling punica extension.")
+    add_dependencies(default _punica_C)
+  endif()
+endif()
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,56 @@
+# Contributing to vLLM
+
+Thank you for your interest in contributing to vLLM!
+Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
+There are several ways you can contribute to the project:
+
+- Identify and report any issues or bugs.
+- Request or add a new model.
+- Suggest or implement new features.
+
+However, remember that contributions aren't just about code.
+We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
+
+Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
+Talk about it in your blog posts, highlighting how it's driving your incredible projects.
+Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
+
+
+## Setup for development
+
+### Build from source
+
+```bash
+pip install -e .  # This may take several minutes.
+```
+
+### Testing
+
+```bash
+pip install -r requirements-dev.txt
+
+# linting and formatting
+bash format.sh
+# Static type checking
+mypy
+# Unit tests
+pytest tests/
+```
+**Note:** Currently, the repository does not pass the mypy tests.
+
+
+## Contributing Guidelines
+
+### Issue Reporting
+
+If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
+If not, please file a new issue, providing as much relevant information as possible.
+
+### Pull Requests & Code Reviews
+
+Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for a detailed contribution guide.
+
+### Thank You
+
+Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
+Your contributions make vLLM a great tool for everyone!
--- a/136
+++ b/136
@@ -0,0 +1,136 @@
+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+# Please update any changes made here to
+# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/assets/dev/dockerfile-stages-dependency.png
+
+#################### BASE BUILD IMAGE ####################
+# prepare basic build environment
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
+
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git
+
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.4/compat/
+
+WORKDIR /workspace
+
+# install build and runtime dependencies
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-cuda.txt
+
+# install development dependencies
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-dev.txt
+
+# cuda arch list used by torch
+# can be useful for both `dev` and `test`
+# explicitly set the list to avoid issues with torch 2.2
+# see https://github.com/pytorch/pytorch/pull/123243
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+#################### BASE BUILD IMAGE ####################
+
+
+#################### WHEEL BUILD IMAGE ####################
+FROM dev AS build
+
+# install build dependencies
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build.txt
+
+# install compiler cache to speed up compilation leveraging local or remote caching
+RUN apt-get update -y && apt-get install -y ccache
+
+# files and directories related to build wheels
+COPY csrc csrc
+COPY setup.py setup.py
+COPY cmake cmake
+COPY CMakeLists.txt CMakeLists.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+COPY pyproject.toml pyproject.toml
+COPY vllm vllm
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 setup.py bdist_wheel --dist-dir=dist
+
+# check the size of the wheel, since oversized wheels cannot be uploaded; the limit is MAX_SIZE_MB (200 MB) in check-wheel-size.py
+COPY .buildkite/check-wheel-size.py check-wheel-size.py
+RUN python3 check-wheel-size.py dist
+
+#################### WHEEL BUILD IMAGE ####################
+
+#################### vLLM installation IMAGE ####################
+# image with vLLM installed
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
+WORKDIR /vllm-workspace
+
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git vim
+
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.4/compat/
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install dist/*.whl --verbose
+#################### vLLM installation IMAGE ####################
+
+
+#################### TEST IMAGE ####################
+# image to run unit testing suite
+# note that this uses vllm installed by `pip`
+FROM vllm-base AS test
+
+ADD . /vllm-workspace/
+
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-dev.txt
+
+# doc requires source code
+# we hide them inside `test_docs/` , so that this source code
+# will not be imported by other tests
+RUN mkdir test_docs
+RUN mv docs test_docs/
+RUN mv vllm test_docs/
+
+#################### TEST IMAGE ####################
+
+#################### OPENAI API SERVER ####################
+# openai api server alternative
+FROM vllm-base AS vllm-openai
+
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate hf_transfer modelscope
+
+ENV VLLM_USAGE_SOURCE=production-docker-image
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+#################### OPENAI API SERVER ####################
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -0,0 +1,26 @@
+# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-1
+
+RUN apt-get update  -y \
+    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+RUN pip install --upgrade pip \
+    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
+
+FROM cpu-test-1 AS build
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+CMD ["/bin/bash"]
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -0,0 +1,36 @@
+# default base image
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
+
+FROM $BASE_IMAGE
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+
+COPY ./vllm /app/vllm/vllm
+COPY ./setup.py /app/vllm/setup.py
+COPY ./requirements-common.txt /app/vllm/requirements-common.txt
+COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+
+RUN cd /app/vllm \
+    && python3 -m pip install -U -r requirements-neuron.txt
+
+ENV VLLM_TARGET_DEVICE=neuron
+RUN cd /app/vllm \
+    && pip install -e . \
+    && cd ..
+
+CMD ["/bin/bash"]
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -0,0 +1,115 @@
+# default base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
+
+FROM $BASE_IMAGE
+
+ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
+# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
+
+
+ARG FA_GFX_ARCHS="gfx90a;gfx942"
+RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
+
+ARG FA_BRANCH="ae7928c"
+RUN echo "FA_BRANCH is $FA_BRANCH"
+
+# whether to build flash-attention
+# if 0, will not build flash attention
+# this is useful for gfx target where flash-attention is not supported
+# In that case, we need to use the python reference attention implementation in vllm
+ARG BUILD_FA="1"
+
+# whether to build triton on rocm
+ARG BUILD_TRITON="1"
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    curl \
+    ca-certificates \
+    sudo \
+    git \
+    bzip2 \
+    libx11-6 \
+    build-essential \
+    wget \
+    unzip \
+    nvidia-cuda-toolkit \
+    tmux \
+ && rm -rf /var/lib/apt/lists/*
+
+### Mount Point ###
+# When launching the container, mount the code directory to ${APP_MOUNT} (/vllm-workspace by default)
+ARG APP_MOUNT=/vllm-workspace
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+
+ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
+ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
+ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
+
+# Install ROCm flash-attention
+RUN if [ "$BUILD_FA" = "1" ]; then \
+    mkdir libs \
+    && cd libs \
+    && git clone https://github.com/ROCm/flash-attention.git \
+    && cd flash-attention \
+    && git checkout ${FA_BRANCH} \
+    && git submodule update --init \
+    && export GPU_ARCHS=${FA_GFX_ARCHS} \
+    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
+        patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
+    && python3 setup.py install \
+    && cd ..; \
+    fi
+
+# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
+# Manually removed it so that later steps of numpy upgrade can continue
+RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
+    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
+
+# build triton
+RUN if [ "$BUILD_TRITON" = "1" ]; then \
+    mkdir -p libs \
+    && cd libs \
+    && pip uninstall -y triton \
+    && git clone https://github.com/ROCm/triton.git \
+    && cd triton/python \
+    && pip3 install . \
+    && cd ../..; \
+    fi
+
+WORKDIR /vllm-workspace
+COPY . .
+
+#RUN python3 -m pip install pynvml # to be removed eventually
+RUN python3 -m pip install --upgrade pip numba
+
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+# Workaround for ray >= 2.10.0
+ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+
+ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -U -r requirements-rocm.txt \
+    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
+    && python3 setup.py install \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
+    && cd ..
+
+
+CMD ["/bin/bash"]
--- a/201
+++ b/201
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1,10 @@
+include LICENSE
+include requirements-common.txt
+include requirements-cuda.txt
+include requirements-rocm.txt
+include requirements-neuron.txt
+include requirements-cpu.txt
+include CMakeLists.txt
+
+recursive-include cmake *
+recursive-include csrc *
--- a/README.md
+++ b/README.md
@@ -1,72 +1,134 @@
-# CacheFlow
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
+    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
+  </picture>
+</p>

-## Installation
+<h3 align="center">
+Easy, fast, and cheap LLM serving for everyone
+</h3>
+
+<p align="center">
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
+
+</p>
+
+---
+
+**Ray Summit CFP is Open (June 4th to June 20th)!**
+
+There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
+If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
+This will be a great chance for everyone in the community to get together and learn.
+Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite).
+
+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
+
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and the roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
+
+---
+
+*Latest News* 🔥
+- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
+- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
+- [2024/01] Added ROCm 6.0 support to vLLM.
+- [2023/12] Added ROCm 5.7 support to vLLM.
+- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
+- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
+- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
+- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
+- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
+- [2023/06] Serving vLLM on any cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
+- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
+
+---
+## About
+vLLM is a fast and easy-to-use library for LLM inference and serving.
+
+vLLM is fast with:
+
+- State-of-the-art serving throughput
+- Efficient management of attention key and value memory with **PagedAttention**
+- Continuous batching of incoming requests
+- Fast model execution with CUDA/HIP graph
+- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
+- Optimized CUDA kernels
+
+vLLM is flexible and easy to use with:
+
+- Seamless integration with popular Hugging Face models
+- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
+- Tensor parallelism support for distributed inference
+- Streaming outputs
+- OpenAI-compatible API server
+- Support for NVIDIA GPUs and AMD GPUs
+- (Experimental) Prefix caching support
+- (Experimental) Multi-LoRA support
+
+vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+- Transformer-like LLMs (e.g., Llama)
+- Mixture-of-Experts LLMs (e.g., Mixtral)
+- Multi-modal LLMs (e.g., LLaVA)
+
+Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## Getting Started
+
+Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

 ```bash
-pip install psutil numpy ray torch
-pip install git+https://github.com/huggingface/transformers  # Required for LLaMA.
-pip install sentencepiece  # Required for LlamaTokenizer.
-pip install ninja  # To parallelize the compilation of flash-attn.
-pip install flash-attn  # This may take up to 10 mins.
-pip install -e .
+pip install vllm
 ```

-## Test simple server
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
+- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
+- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
+- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)

-```bash
-ray start --head
-python simple_server.py
+## Contributing
+
+We welcome and value any contributions and collaborations.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+
+## Sponsors
+
+vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
+
+<!-- Note: Please sort them in alphabetical order. -->
+<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
+
+- a16z
+- AMD
+- Anyscale
+- AWS
+- Crusoe Cloud
+- Databricks
+- DeepInfra
+- Dropbox
+- Lambda Lab
+- NVIDIA
+- Replicate
+- Roblox
+- RunPod
+- Sequoia Capital
+- Trainy
+- UC Berkeley
+- UC San Diego
+
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
+
+## Citation
+
+If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
+```bibtex
+@inproceedings{kwon2023efficient,
+  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
+  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
+  booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
+  year={2023}
+}
 ```
-
-The detailed arguments for `simple_server.py` can be found by:
-```bash
-python simple_server.py --help
-```
-
-## FastAPI server
-
-Install the following additional dependencies:
-```bash
-pip install fastapi uvicorn
-```
-
-To start the server:
-```bash
-ray start --head
-python -m cacheflow.http_frontend.fastapi_frontend
-```
-
-To test the server:
-```bash
-python -m cacheflow.http_frontend.test_cli_client
-```
-
-## Gradio web server
-
-Install the following additional dependencies:
-```bash
-pip install gradio
-```
-
-Start the server:
-```bash
-python -m cacheflow.http_frontend.fastapi_frontend
-# At another terminal
-python -m cacheflow.http_frontend.gradio_webserver
-```
-
-## Load LLaMA weights
-
-Since LLaMA weight is not fully public, we cannot directly download the LLaMA weights from huggingface. Therefore, you need to follow the following process to load the LLaMA weights.
-
-1. Converting LLaMA weights to huggingface format with [this script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py).
-    ```bash
-    python src/transformers/models/llama/convert_llama_weights_to_hf.py \
-        --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/llama-7b
-    ```
-    Please make sure that `llama` is included in the output directory name.
-2. For all the commands above, specify the model with `--model /output/path/llama-7b` to load the model. For example:
-    ```bash
-    python simple_server.py --model /output/path/llama-7b
-    python -m cacheflow.http_frontend.fastapi_frontend --model /output/path/llama-7b
-    ```
--- a/benchmark/benchmark_attention.py
+++ b/benchmark/benchmark_attention.py
@@ -1,165 +0,0 @@
-import functools
-import random
-import time
-from typing import List
-
-from flash_attn.flash_attn_interface import _flash_attn_forward
-import torch
-
-from cacheflow import attention_ops
-
-
-def benchmark(name, f, num_warmup = 10, num_iters = 100):
-    for _ in range(num_warmup):
-        f()
-    torch.cuda.synchronize()
-
-    start = time.time()
-    for _ in range(num_iters):
-        f()
-    torch.cuda.synchronize()
-    end = time.time()
-    print(f'{name}: {(end - start) / num_iters * 1000:.3f} ms')
-
-
-@torch.inference_mode()
-def benchmark_multi_query_cached_kv_attention(
-    query_lens: List[int],
-    context_lens: List[int],
-    num_heads: int,
-    head_size: int,
-    block_size: int,
-    num_blocks: int,
-    dtype: torch.dtype,
-) -> None:
-    print(f'query_lens: {query_lens}, context_lens: {context_lens}, '
-          f'num_heads: {num_heads}, head_size: {head_size}, block_size: '
-          f'{block_size}, num_blocks: {num_blocks}, dtype: {dtype}')
-    # Create query tensor.
-    num_queries = len(query_lens)
-    cu_query_lens = [0]
-    for query_len in query_lens:
-        cu_query_lens.append(cu_query_lens[-1] + query_len)
-    num_total_tokens = cu_query_lens[-1]
-    qkv = torch.randn(
-        num_total_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
-    query, _, _ = qkv.unbind(dim=1)
-
-    # Create key and value cache.
-    x = 16 // torch.tensor([], dtype=dtype).element_size()
-    key_block_shape = (num_heads, head_size // x, block_size, x)
-    key_cache = torch.randn(
-        size=(num_blocks, *key_block_shape), dtype=dtype, device='cuda')
-    value_block_shape = (num_heads, head_size, block_size)
-    value_cache = torch.randn(
-        size=(num_blocks, *value_block_shape), dtype=dtype, device='cuda')
-
-    # Create block tables.
-    max_context_len = max(context_lens)
-    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
-    block_tables = []
-    for _ in range(num_queries):
-        block_table = [
-            random.randint(0, num_blocks - 1)
-            for _ in range(max_num_blocks_per_seq)
-        ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int, device='cuda')
-
-    # Create input and output data structures.
-    cu_query_lens = torch.tensor(cu_query_lens, dtype=torch.int, device='cuda')
-    context_len_tensor = torch.tensor(context_lens, dtype=torch.int, device='cuda')
-    scale = float(1.0 / (head_size ** 0.5))
-    output = torch.empty(
-        num_total_tokens, num_heads, head_size, dtype=dtype, device='cuda')
-
-    # Run our implementation.
-    def run_ours():
-        attention_ops.multi_query_cached_kv_attention(
-            cu_query_lens,
-            output,
-            query,
-            key_cache,
-            value_cache,
-            scale,
-            block_tables,
-            context_len_tensor,
-            block_size,
-            max_context_len,
-        )
-    benchmark('Ours', run_ours)
-
-    # Upper bound: Flash attention.
-    # Becuase Flash attention cannot read our own cache,
-    # we make key and value tensors contiguous.
-    num_kv_tokens = sum(context_lens)
-    cu_context_lens = [0]
-    for context_len in context_lens:
-        cu_context_lens.append(cu_context_lens[-1] + context_len)
-    cu_context_lens = torch.tensor(cu_context_lens, dtype=torch.int, device='cuda')
-    qkv = torch.randn(
-        num_kv_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
-    _, key, value = qkv.unbind(dim=1)
-    ref_output = torch.empty_like(output)
-
-    # Run Flash attention.
-    def run_flash_attn():
-        _flash_attn_forward(
-            query,
-            key,
-            value,
-            ref_output,
-            cu_query_lens,
-            cu_context_lens,
-            max(query_lens),
-            max_context_len,
-            dropout_p=0.0,
-            softmax_scale=scale,
-            causal=True,
-            return_softmax=False,
-        )
-    benchmark('Flash attention', run_flash_attn)
-
-
-if __name__ == '__main__':
-    BLOCK_SIZE = 8
-    NUM_BLOCKS = 1024
-    DTYPE = torch.half
-
-    # LLaMA-13B and OPT-13B
-    NUM_HEADS = 40
-    HEAD_SIZE = 128
-
-    run_benchmark = functools.partial(
-        benchmark_multi_query_cached_kv_attention,
-        num_heads=NUM_HEADS,
-        head_size=HEAD_SIZE,
-        block_size=BLOCK_SIZE,
-        num_blocks=NUM_BLOCKS,
-        dtype=DTYPE,
-    )
-
-    run_benchmark(
-        query_lens=[64] * 1,
-        context_lens=[64] * 1,
-    )
-    run_benchmark(
-        query_lens=[128] * 1,
-        context_lens=[128] * 1,
-    )
-    run_benchmark(
-        query_lens=[64] * 8,
-        context_lens=[64] * 8,
-    )
-    run_benchmark(
-        query_lens=[128] * 8,
-        context_lens=[128] * 8,
-    )
-    run_benchmark(
-        query_lens=[64, 32, 16],
-        context_lens=[128, 256, 64],
-    )
-    run_benchmark(
-        query_lens=[1024],
-        context_lens=[1024],
-    )
--- a/benchmark/benchmark_cache.py
+++ b/benchmark/benchmark_cache.py
@@ -1,81 +0,0 @@
-import functools
-import random
-import time
-
-import torch
-
-from cacheflow import cache_ops
-
-
-def benchmark(name, f, size: int, num_warmup = 10, num_iters = 100):
-    for _ in range(num_warmup):
-        f()
-    torch.cuda.synchronize()
-
-    start = time.time()
-    for _ in range(num_iters):
-        f()
-    torch.cuda.synchronize()
-    end = time.time()
-    avg_time = (end - start) / num_iters
-    print(f'[Latency] {name}: {avg_time * 1000:.3f} ms')
-    print(f'[Throughput] {name}: {size / avg_time / 2 ** 30:.3f} GB/s')
-
-
-@torch.inference_mode()
-def test_gather_cached_kv(
-    num_tokens: int,
-    num_heads: int,
-    head_size: int,
-    block_size: int,
-    num_blocks: int,
-    dtype: torch.dtype,
-) -> None:
-    print(f'num_tokens: {num_tokens}, num_heads: {num_heads}, '
-          f'head_size: {head_size}, block_size: {block_size}, '
-          f'num_blocks: {num_blocks}, dtype: {dtype}')
-
-    num_slots = block_size * num_blocks
-    slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device='cuda')
-
-    qkv = torch.randn(
-        num_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
-    _, key, value = qkv.unbind(dim=1)
-
-    x = 16 // torch.tensor([], dtype=dtype).element_size()
-    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
-    key_cache = torch.randn(size=key_cache_shape, dtype=dtype, device='cuda')
-
-    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-    value_cache = torch.randn(
-        size=value_cache_shape, dtype=dtype, device='cuda')
-
-    # Run Flash attention.
-    def run():
-        cache_ops.gather_cached_kv(key, value, key_cache, value_cache, slot_mapping)
-
-    benchmark('gather_cached_kv', run,
-              size=num_tokens * num_heads * head_size * 2 * qkv.element_size())
-
-
-if __name__ == '__main__':
-    BLOCK_SIZE = 8
-    NUM_BLOCKS = 1024
-    DTYPE = torch.half
-
-    # LLaMA-13B and OPT-13B
-    NUM_HEADS = 40
-    HEAD_SIZE = 128
-
-    run_benchmark = functools.partial(
-        test_gather_cached_kv,
-        num_heads=NUM_HEADS,
-        head_size=HEAD_SIZE,
-        block_size=BLOCK_SIZE,
-        num_blocks=NUM_BLOCKS,
-        dtype=DTYPE,
-    )
-
-    for i in range(6, 12):
-        run_benchmark(num_tokens=2 ** i)
--- a/benchmark/benchmark_latency.py
+++ b/benchmark/benchmark_latency.py
@@ -1,105 +0,0 @@
-import argparse
-import time
-from typing import List
-
-from tqdm import tqdm
-import numpy as np
-import torch
-
-from cacheflow.master.simple_frontend import SimpleFrontend
-from cacheflow.master.server import (Server, add_server_arguments,
-                                     initialize_ray_cluster)
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.utils import get_gpu_memory, get_cpu_memory
-
-
-def main(args: argparse.Namespace):
-    # TODO(zhuohan): Support pipeline parallelism.
-    assert args.pipeline_parallel_size == 1, (
-        'Pipeline parallelism is not supported yet.')
-
-    (num_nodes, num_devices_per_node, distributed_init_method,
-    all_stage_devices) = (
-        initialize_ray_cluster(
-            address='local',
-            pipeline_parallel_size=args.pipeline_parallel_size,
-            tensor_parallel_size=args.tensor_parallel_size))
-
-    # Create a server.
-    server = Server(
-        model=args.model,
-        model_path=args.model_path,
-        use_dummy_weights=args.use_dummy_weights,
-        pipeline_parallel_size=args.pipeline_parallel_size,
-        tensor_parallel_size=args.tensor_parallel_size,
-        block_size=args.block_size,
-        dtype=args.dtype,
-        seed=args.seed,
-        swap_space=args.swap_space,
-        max_num_batched_tokens=args.max_num_batched_tokens,
-        max_num_sequences=args.max_num_sequences,
-        num_nodes=num_nodes,
-        num_devices_per_node=num_devices_per_node,
-        distributed_init_method=distributed_init_method,
-        all_stage_devices=all_stage_devices,
-        gpu_memory=get_gpu_memory(),
-        cpu_memory=get_cpu_memory(),
-    )
-
-    # Create a frontend.
-    frontend = SimpleFrontend(
-        model_name=args.model,
-        block_size=args.block_size,
-    )
-    sampling_params_dict = {
-        'n': args.n,
-        'temperature': 0.0 if args.use_beam_search else 1.0,
-        'top_p': 1.0,
-        'use_beam_search': args.use_beam_search,
-        'stop_token_ids': set(),
-        'max_num_steps': args.output_len,
-    }
-    sampling_params = SamplingParams.from_dict(sampling_params_dict)
-    print(sampling_params)
-    input_token_ids = [0] * args.input_len
-
-    def profile_step(profile=False):
-        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        for _ in range(args.batch_size):
-            frontend._add_query(input_token_ids, sampling_params)
-        server.add_sequence_groups(frontend.get_inputs())
-        start_time = time.time()
-        while True:
-            server.step()
-            if not server.has_unfinished_requests():
-                break
-        end_time = time.time()
-        latency = end_time - start_time
-        if profile:
-            torch.cuda.cudart().cudaProfilerStop()
-        return latency
-
-    print("Warm up step")
-    profile_step()
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(3), desc="Profile step"):
-        latencies.append(profile_step())
-    print(f'Avg latency: {np.mean(latencies)} seconds')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='CacheFlow simple server.')
-    parser = add_server_arguments(parser)
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n', type=int, default=1)
-    parser.add_argument('--use-beam-search', action='store_true')
-    args = parser.parse_args()
-    args.max_num_batched_tokens = max(
-        args.max_num_batched_tokens, args.batch_size * args.input_len)
-    print(args)
-    main(args)
--- a/benchmark/benchmark_text_completion.py
+++ b/benchmark/benchmark_text_completion.py
@@ -1,290 +0,0 @@
-import argparse
-import logging
-import os
-import pickle
-import time
-from typing import List
-
-from tqdm import tqdm
-from transformers import AutoConfig
-
-from benchmark.trace import generate_text_completion_requests
-from cacheflow.master.simple_frontend import SimpleFrontend
-from cacheflow.master.server import (Server, add_server_arguments,
-                                     initialize_ray_cluster)
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.utils import get_gpu_memory, get_cpu_memory
-
-
-logger = logging.getLogger(__name__)
-
-
-def main(args: argparse.Namespace):
-    assert args.pipeline_parallel_size == 1, (
-        'Pipeline parallelism is not supported yet.')
-
-    (num_nodes, num_devices_per_node, distributed_init_method,
-    all_stage_devices) = (
-        initialize_ray_cluster(
-            address='local',
-            pipeline_parallel_size=args.pipeline_parallel_size,
-            tensor_parallel_size=args.tensor_parallel_size))
-
-    # Create a server.
-    server = Server(
-        model=args.model,
-        model_path=args.model_path,
-        use_dummy_weights=args.use_dummy_weights,
-        pipeline_parallel_size=args.pipeline_parallel_size,
-        tensor_parallel_size=args.tensor_parallel_size,
-        block_size=args.block_size,
-        dtype=args.dtype,
-        seed=args.seed,
-        swap_space=args.swap_space,
-        max_num_batched_tokens=args.max_num_batched_tokens,
-        max_num_sequences=args.max_num_sequences,
-        num_nodes=num_nodes,
-        num_devices_per_node=num_devices_per_node,
-        distributed_init_method=distributed_init_method,
-        all_stage_devices=all_stage_devices,
-        gpu_memory=get_gpu_memory(),
-        cpu_memory=get_cpu_memory(),
-        collect_stats=True,
-        do_memory_analysis=args.do_memory_analysis,
-    )
-
-    # Create a frontend.
-    frontend = SimpleFrontend(
-        model_name=args.model,
-        block_size=args.block_size,
-    )
-    # Generate requests.
-    requests = generate_text_completion_requests(
-        args.dataset,
-        args.request_rate,
-        args.duration,
-        args.seed,
-        args.n1,
-        args.n2,
-        args.n3,
-        args.n4,
-        args.n6,
-        args.n2_beam,
-        args.n4_beam,
-        args.n6_beam,
-        args.n8_beam,
-    )
-
-    # Warm up.
-    logger.info('Warming up.')
-    num_warmup_requests = 8
-    warmup_input_len = 8
-    warmup_output_len = 32
-    warmup_sampling_params = SamplingParams(
-        n=1,
-        temperature=1.0,
-        top_p=0.99,
-        max_num_steps=warmup_output_len,
-        use_beam_search=False,
-        stop_token_ids=set(),
-        num_logprobs=0,
-        context_window_size=None,
-    )
-    for _ in range(num_warmup_requests):
-        frontend._add_query([0] * warmup_input_len, warmup_sampling_params)
-    server.add_sequence_groups(frontend.get_inputs())
-    while True:
-        server.step()
-        if not server.has_unfinished_requests():
-            break
-
-    # Start benchmarking.
-    logger.info('Start benchmarking.')
-    # Initialize tqdm.
-    pbar = tqdm(total=len(requests), desc='Finished requests')
-
-    finished = []
-    server.scheduler.reset_stats()
-    start_time = time.time()
-    while True:
-        now = time.time()
-        if args.timeout is not None and now - start_time > args.timeout:
-            logger.info('Timeout. Stop benchmarking.')
-            break
-
-        while requests:
-            if requests[0][0] <= now - start_time:
-                request_time, input_tokens, sampling_params = requests.pop(0)
-                frontend._add_query(
-                    input_tokens, sampling_params, arrival_time=start_time + request_time)
-            else:
-                break
-        server.add_sequence_groups(frontend.get_inputs())
-        updated_seq_groups = server.step()
-
-        now = time.time()
-        for seq_group in updated_seq_groups:
-            if not seq_group.is_finished():
-                continue
-            arrival_time = seq_group.arrival_time
-            finish_time = now
-            for seq in seq_group.get_seqs():
-                seq_len = seq.get_len()
-                output_len = seq_len - seq.prompt_len
-                finished.append({
-                    'group_id': seq_group.group_id,
-                    'seq_id': seq.seq_id,
-                    'arrival_time': arrival_time, 
-                    'finish_time': finish_time,
-                    'prompt_len': seq.prompt_len,
-                    'output_len': output_len,
-                })
-            pbar.update(1)
-
-        if not (requests or server.has_unfinished_requests()):
-            break
-    pbar.close()
-    logger.info('Finish benchmarking. Saving stats.')
-    server.scheduler.save_stats(args.output_dir)
-    with open(os.path.join(args.output_dir, 'sequences.pkl'), 'wb') as f:
-        pickle.dump(finished, f)
-    logger.info('Done.')
-
-
-def get_model_name(model: str) -> str:
-    OPT_MODELS = [
-        'opt-125m',
-        'opt-350m',
-        'opt-1.3b',
-        'opt-2.7b',
-        'opt-6.7b',
-        'opt-13b',
-        'opt-30b',
-        'opt-66b',
-        'opt-175b',
-    ]
-    for opt_model in OPT_MODELS:
-        if opt_model in model:
-            return opt_model
-
-    config = AutoConfig.from_pretrained(model)
-    assert config.model_type == 'llama'
-    hidden_size = config.hidden_size
-    if hidden_size == 4096:
-        return 'llama-7b'
-    elif hidden_size == 5120:
-        return 'llama-13b'
-    elif hidden_size == 6656:
-        return 'llama-30b'
-    elif hidden_size == 8192:
-        return 'llama-65b'
-    else:
-        raise ValueError(f'Unknown model: {model}')
-
-
-def get_dataset_name(dataset: str) -> str:
-    if 'sharegpt' in dataset.lower():
-        return 'sharegpt'
-    elif 'alpaca' in dataset.lower():
-        return 'alpaca'
-    else:
-        raise ValueError(f'Unknown dataset: {dataset}')
-
-
-def get_sampling_dir_name(
-    n1: float,
-    n2: float,
-    n3: float,
-    n4: float,
-    n6: float,
-    n2_beam: float,
-    n4_beam: float,
-    n6_beam: float,
-    n8_beam: float,
-) -> str:
-    method = ''
-    if n1 > 0.0:
-        method = 'n1' if n1 == 1.0 else method + f'n1-{n1}-'
-    if n2 > 0.0:
-        method = 'n2' if n2 == 1.0 else method + f'n2-{n2}-'
-    if n3 > 0.0:
-        method = 'n3' if n3 == 1.0 else method + f'n3-{n3}-'
-    if n4 > 0.0:
-        method = 'n4' if n4 == 1.0 else method + f'n4-{n4}-'
-    if n6 > 0.0:
-        method = 'n6' if n6 == 1.0 else method + f'n6-{n6}-'
-    if n2_beam > 0.0:
-        method = 'n2-beam' if n2_beam == 1.0 else method + f'n2-beam-{n2_beam}-'
-    if n4_beam > 0.0:
-        method = 'n4-beam' if n4_beam == 1.0 else method + f'n4-beam-{n4_beam}-'
-    if n6_beam > 0.0:
-        method = 'n6-beam' if n6_beam == 1.0 else method + f'n6-beam-{n6_beam}-'
-    if n8_beam > 0.0:
-        method = 'n8-beam' if n8_beam == 1.0 else method + f'n8-beam-{n8_beam}-'
-    return method[:-1] if method.endswith('-') else method
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='CacheFlow simple server.')
-    parser = add_server_arguments(parser) 
-    parser.add_argument('--output-dir', type=str, help='path to output directory', default=None)
-
-    parser.add_argument('--dataset', type=str, help='path to dataset', required=True)
-    parser.add_argument('--request-rate', type=float, help='reqs/sec', required=True)
-    parser.add_argument('--duration', type=int, help='duration in seconds', required=True)
-    parser.add_argument('--do-memory-analysis', action='store_true',
-        help='do memory analysis (This will lower the throughput. Use this only for analysis.)')
-    parser.add_argument('--timeout', type=int, help='time out in seconds', default=None)
-
-    parser.add_argument('--n1', type=float, help='ratio of requests with n=1', default=0.0)
-    parser.add_argument('--n2', type=float, help='ratio of requests with n=2', default=0.0)
-    parser.add_argument('--n3', type=float, help='ratio of requests with n=3', default=0.0)
-    parser.add_argument('--n4', type=float, help='ratio of requests with n=4', default=0.0)
-    parser.add_argument('--n6', type=float, help='ratio of requests with n=6', default=0.0)
-    parser.add_argument('--n2-beam', type=float, help='ratio of requests with n=2 & beam search', default=0.0)
-    parser.add_argument('--n4-beam', type=float, help='ratio of requests with n=4 & beam search', default=0.0)
-    parser.add_argument('--n6-beam', type=float, help='ratio of requests with n=6 & beam search', default=0.0)
-    parser.add_argument('--n8-beam', type=float, help='ratio of requests with n=8 & beam search', default=0.0)
-    args = parser.parse_args()
-    if args.n1 + args.n2 + args.n3 + args.n4 + args.n6 + args.n2_beam + args.n4_beam + args.n6_beam + args.n8_beam != 1.0:
-        raise ValueError('The ratios of requests must sum to 1.')
-
-    model_name = get_model_name(args.model)
-    dataset_name = get_dataset_name(args.dataset)
-    if 'opt' in model_name:
-        if 'opt' not in args.dataset.lower():
-            raise ValueError(f'OPT models can only be used with OPT datasets.')
-    elif 'llama' in model_name:
-        if 'llama' not in args.dataset.lower():
-            raise ValueError(f'Llama models can only be used with Llama datasets.')
-
-    dataset_name = 'sharegpt' if 'sharegpt' in args.dataset else 'alpaca'
-    sample_dir = get_sampling_dir_name(
-        args.n1, args.n2, args.n3, args.n4, args.n6, args.n2_beam, args.n4_beam, args.n6_beam, args.n8_beam)
-    if args.output_dir is None:
-        args.output_dir = os.path.join(
-            '../exp',
-            dataset_name,
-            f'{model_name}-tp{args.tensor_parallel_size}',
-            sample_dir,
-            'cacheflow',
-            f'block{args.block_size}',
-            f'req-rate-{args.request_rate}',
-            f'seed{args.seed}',
-            f'duration-{args.duration}',
-        )
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    # Set up logging.
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-        handlers=[
-            logging.StreamHandler(),
-            logging.FileHandler(os.path.join(args.output_dir, 'log.txt')),
-        ],
-    )
-    logger.info(args)
-
-    main(args)
--- a/benchmark/trace.py
+++ b/benchmark/trace.py
@@ -1,116 +0,0 @@
-import pickle
-import random
-from typing import List, Tuple
-
-import numpy as np
-
-from cacheflow.sampling_params import SamplingParams
-
-
-def generate_text_completion_requests(
-    dataset: str,
-    request_rate: float,
-    duration: int,
-    seed: int,
-    n1: float = 0.0,
-    n2: float = 0.0,
-    n3: float = 0.0,
-    n4: float = 0.0,
-    n6: float = 0.0,
-    n2_beam: float = 0.0,
-    n4_beam: float = 0.0,
-    n6_beam: float = 0.0,
-    n8_beam: float = 0.0,
-    max_seq_len: int = 2048,
-    time_quantum: int = 10,
-) -> List[Tuple[float, List[int], SamplingParams]]:
-    random.seed(seed)
-    np.random.seed(seed)
-
-    # Generate timestamps for requests using Poisson distribution.
-    lam = request_rate * (time_quantum / 1000)
-    quantums_per_sec = 1000 / time_quantum
-    arrival_times = np.random.poisson(
-        lam=lam, size=int(duration * quantums_per_sec))
-    timestamps = []
-    for i, n in enumerate(arrival_times):
-        timestamps += [i * (time_quantum / 1000)] * n
-
-    # Load and shuffle the dataset.
-    num_requests = len(timestamps)
-    with open(dataset, 'rb') as f:
-        data = pickle.load(f)
-
-    filtered = []
-    for pair in data:
-        input_tokens, output_tokens = pair
-        input_len = len(input_tokens)
-        output_len = len(output_tokens)
-        # Filter out too long sequences.
-        if input_len + output_len < max_seq_len:
-            # Output tokens are not needed for the benchmark.
-            filtered.append((input_tokens, output_len))
-
-    data = []
-    while len(data) < num_requests:
-        data += filtered
-    data = data[:num_requests]
-    # Shuffle the data.
-    assert len(data) == len(timestamps)
-    random.shuffle(data)
-
-    random_sampling_params_dict = {
-        'temperature': 1.0,
-        'top_p': 1.0,
-        'use_beam_search': False,
-        'stop_token_ids': set(),
-        'num_logprobs': 0,
-        'context_window_size': None,
-    }
-    beam_search_params_dict = {
-        'temperature': 0.0,
-        'top_p': 1.0,
-        'use_beam_search': True,
-        'stop_token_ids': set(),
-        'num_logprobs': 0,
-        'context_window_size': None,
-    }
-
-    # Generate requests based on the sampling parameter ratio.
-    requests = []
-    assert n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam + n6_beam + n8_beam == 1.0
-    cum_sum = 0
-    for timestamp, pair in zip(timestamps, data):
-        input_tokens, output_len = pair
-        if cum_sum < n1 * num_requests:
-            sampling_params = SamplingParams(
-                n=1, max_num_steps=output_len, **random_sampling_params_dict)
-        elif cum_sum < (n1 + n2) * num_requests:
-            sampling_params = SamplingParams(
-                n=2, max_num_steps=output_len, **random_sampling_params_dict)
-        elif cum_sum < (n1 + n2 + n3) * num_requests:
-            sampling_params = SamplingParams(
-                n=3, max_num_steps=output_len, **random_sampling_params_dict)
-        elif cum_sum < (n1 + n2 + n3 + n4) * num_requests:
-            sampling_params = SamplingParams(
-                n=4, max_num_steps=output_len, **random_sampling_params_dict)
-        elif cum_sum < (n1 + n2 + n3 + n4 + n6) * num_requests:
-            sampling_params = SamplingParams(
-                n=6, max_num_steps=output_len, **random_sampling_params_dict)
-        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam) * num_requests:
-            sampling_params = SamplingParams(
-                n=2, max_num_steps=output_len, **beam_search_params_dict)
-        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam) * num_requests:
-            sampling_params = SamplingParams(
-                n=4, max_num_steps=output_len, **beam_search_params_dict)
-        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam + n6_beam) * num_requests:
-            sampling_params = SamplingParams(
-                n=6, max_num_steps=output_len, **beam_search_params_dict)
-        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam + n6_beam + n8_beam) * num_requests:
-            sampling_params = SamplingParams(
-                n=8, max_num_steps=output_len, **beam_search_params_dict)
-        else:
-            raise ValueError('Invalid request ratio.')
-        cum_sum += 1
-        requests.append((timestamp, input_tokens, sampling_params))
-    return requests
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -0,0 +1,8 @@
+# Benchmarking vLLM
+
+## Downloading the ShareGPT dataset
+
+You can download the dataset by running:
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -0,0 +1,395 @@
+import json
+import os
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import aiohttp
+from tqdm.asyncio import tqdm
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+
+@dataclass
+class RequestFuncInput:
+    """Parameters describing one benchmark request sent to a backend."""
+    prompt: str  # Prompt text placed in the request payload.
+    api_url: str  # Endpoint URL the request is POSTed to.
+    prompt_len: int  # Copied verbatim onto RequestFuncOutput.prompt_len.
+    output_len: int  # Sent as the backend's max (new) tokens limit.
+    model: str  # Model name; included in the OpenAI-style payloads.
+    best_of: int = 1  # Forwarded as "best_of" by TGI/OpenAI completions.
+    use_beam_search: bool = False  # Request funcs here assert this is False.
+
+
+@dataclass
+class RequestFuncOutput:
+    """Result and timing measurements recorded for one benchmark request."""
+    generated_text: str = ""  # Text returned by the backend.
+    success: bool = False  # Set True only after a 200 response is consumed.
+    latency: float = 0.0  # time.perf_counter() delta for the whole request.
+    ttft: float = 0.0  # Time to first token
+    itl: List[float] = field(
+        default_factory=list)  # List of inter-token latencies
+    prompt_len: int = 0  # Copied from RequestFuncInput.prompt_len.
+    error: str = ""  # HTTP reason phrase or formatted traceback on failure.
+
+
+async def async_request_tgi(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one streaming request to a TGI ``generate_stream`` endpoint.
+
+    Args:
+        request_func_input: Prompt, endpoint URL and generation settings.
+        pbar: Optional progress bar, advanced by one once the request ends.
+
+    Returns:
+        A RequestFuncOutput with the generated text and timing data. On
+        failure ``success`` is False and ``error`` holds the HTTP reason
+        phrase, a short message, or a formatted traceback.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        params = {
+            "best_of": request_func_input.best_of,
+            "max_new_tokens": request_func_input.output_len,
+            "do_sample": True,
+            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
+            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+        }
+        payload = {
+            "inputs": request_func_input.prompt,
+            "parameters": params,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        # Most recently decoded event; stays None if nothing was streamed.
+        data = None
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")
+
+                        data = json.loads(chunk)
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            # Reuse `timestamp` so that ttft plus the
+                            # inter-token latencies adds up to `latency`.
+                            ttft = timestamp - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp -
+                                              most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    if data is None:
+                        # A 200 response with an empty body has no final
+                        # event to read the text from; report it explicitly
+                        # instead of failing on an unbound variable.
+                        output.error = "Response stream contained no data."
+                        output.success = False
+                    else:
+                        output.latency = most_recent_timestamp - st
+                        output.success = True
+                        output.generated_text = data["generated_text"]
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_trt_llm(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one streaming request to a TensorRT-LLM ``generate_stream`` URL.
+
+    Args:
+        request_func_input: Prompt, endpoint URL and generation settings.
+            ``best_of`` must be 1 and beam search must be disabled.
+        pbar: Optional progress bar, advanced by one once the request ends.
+
+    Returns:
+        A RequestFuncOutput with the concatenated ``text_output`` chunks
+        and timing data. On failure ``success`` is False and ``error``
+        holds the HTTP reason phrase or a formatted traceback.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        assert request_func_input.best_of == 1
+        payload = {
+            "accumulate_tokens": True,
+            "text_input": request_func_input.prompt,
+            "temperature": 0.0,
+            "top_p": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")
+
+                        data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            # Reuse `timestamp` so that ttft plus the
+                            # inter-token latencies adds up to `latency`.
+                            ttft = timestamp - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp -
+                                              most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_deepspeed_mii(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one non-streaming request to a DeepSpeed-MII endpoint.
+
+    Args:
+        request_func_input: Prompt, endpoint URL and generation settings.
+            ``best_of`` must be 1 and beam search must be disabled.
+        pbar: Optional progress bar, advanced by one once the request ends.
+
+    Returns:
+        A RequestFuncOutput with the first returned text and the total
+        latency. On failure ``success`` is False and ``error`` holds the
+        HTTP reason phrase or a formatted traceback.
+    """
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert request_func_input.best_of == 1
+        assert not request_func_input.use_beam_search
+
+        result = RequestFuncOutput()
+        result.prompt_len = request_func_input.prompt_len
+        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
+        # so time-to-first-token is recorded as a 0 placeholder.
+        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
+        result.ttft = 0
+
+        request_body = {
+            "prompt": request_func_input.prompt,
+            "max_tokens": request_func_input.output_len,
+            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
+            "top_p": 1.0,
+        }
+
+        started_at = time.perf_counter()
+        try:
+            async with session.post(url=request_func_input.api_url,
+                                    json=request_body) as response:
+                if response.status != 200:
+                    result.error = response.reason or ""
+                    result.success = False
+                else:
+                    body = await response.json()
+                    result.latency = time.perf_counter() - started_at
+                    result.generated_text = body["text"][0]
+                    result.success = True
+        except Exception:
+            result.success = False
+            # Same text as joining format_exception(*sys.exc_info()).
+            result.error = traceback.format_exc()
+
+        if pbar:
+            pbar.update(1)
+        return result
+
+
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one streaming request to an OpenAI-style ``v1/completions`` URL.
+
+    Args:
+        request_func_input: Prompt, endpoint URL, model and generation
+            settings. Beam search must be disabled.
+        pbar: Optional progress bar, advanced by one once the request ends.
+
+    Returns:
+        A RequestFuncOutput with the concatenated streamed text and timing
+        data. On failure ``success`` is False and ``error`` holds the HTTP
+        reason phrase or a formatted traceback.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "v1/completions"
+    ), "OpenAI Completions API URL must end with 'v1/completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        payload = {
+            "model": request_func_input.model,
+            "prompt": request_func_input.prompt,
+            "temperature": 0.0,
+            "best_of": request_func_input.best_of,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        # Set when the "[DONE]" sentinel arrives; None until then.
+        latency = None
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
+                            data = json.loads(chunk)
+
+                            if data["choices"][0]["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    # Reuse `timestamp` so ttft and the
+                                    # inter-token latencies share one clock
+                                    # reading per chunk.
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                # NOTE: Some completion API might have a last
+                                # usage summary response without a token so we
+                                # do not want to include as inter-token-latency
+                                elif data.get("usage", None) is None:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += data["choices"][0]["text"]
+
+                    if latency is None:
+                        # The stream ended without a "[DONE]" sentinel; fall
+                        # back to the time of the last received token rather
+                        # than failing on an unbound variable.
+                        latency = most_recent_timestamp - st
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one streaming request to an OpenAI-style chat completions URL.
+
+    The prompt is sent as a single ``user`` message.
+
+    Args:
+        request_func_input: Prompt, endpoint URL, model and generation
+            settings. Beam search must be disabled.
+        pbar: Optional progress bar, advanced by one once the request ends.
+
+    Returns:
+        A RequestFuncOutput with the concatenated streamed content and
+        timing data. On failure ``success`` is False and ``error`` holds
+        the HTTP reason phrase or a formatted traceback.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "v1/chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        payload = {
+            "model": request_func_input.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": request_func_input.prompt,
+                },
+            ],
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        # Set when the "[DONE]" sentinel arrives; None until then.
+        latency = None
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            delta = data["choices"][0]["delta"]
+                            if delta.get("content", None):
+                                # First token
+                                if ttft == 0.0:
+                                    # Reuse `timestamp` so ttft and the
+                                    # inter-token latencies share one clock
+                                    # reading per chunk.
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+
+                                generated_text += delta["content"]
+
+                            most_recent_timestamp = timestamp
+
+                    if latency is None:
+                        # The stream ended without a "[DONE]" sentinel; fall
+                        # back to the time of the last received chunk rather
+                        # than failing on an unbound variable.
+                        latency = most_recent_timestamp - st
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
+# introduced in Python 3.9
+def remove_prefix(text: str, prefix: str) -> str:
+    """Return ``text`` without a leading ``prefix``; unchanged if absent."""
+    has_prefix = text.startswith(prefix)
+    return text[len(prefix):] if has_prefix else text
+
+
+# Maps a backend name to the coroutine function that sends one request to
+# it. "vllm", "lmdeploy" and "openai" all share the OpenAI completions
+# request function.
+ASYNC_REQUEST_FUNCS = {
+    "tgi": async_request_tgi,
+    "vllm": async_request_openai_completions,
+    "lmdeploy": async_request_openai_completions,
+    "deepspeed-mii": async_request_deepspeed_mii,
+    "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
+    "tensorrt-llm": async_request_trt_llm,
+}
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -0,0 +1,233 @@
+"""Benchmark the latency of processing a single batch of requests."""
+import argparse
+import json
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from vllm import LLM, SamplingParams
+from vllm.inputs import PromptStrictInputs
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def main(args: argparse.Namespace):
+    """Benchmark the latency of generating for one batch of dummy prompts.
+
+    Builds an LLM from ``args``, runs ``args.num_iters_warmup`` warmup
+    iterations, then either profiles a single iteration (``args.profile``)
+    or times ``args.num_iters`` iterations and prints the mean and the
+    10/25/50/75/90th percentile latencies. When ``args.output_json`` is
+    set, the results are also written to that path as JSON.
+    """
+    print(args)
+
+    # NOTE(woosuk): If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(model=args.model,
+              speculative_model=args.speculative_model,
+              num_speculative_tokens=args.num_speculative_tokens,
+              tokenizer=args.tokenizer,
+              quantization=args.quantization,
+              tensor_parallel_size=args.tensor_parallel_size,
+              trust_remote_code=args.trust_remote_code,
+              dtype=args.dtype,
+              enforce_eager=args.enforce_eager,
+              kv_cache_dtype=args.kv_cache_dtype,
+              quantization_param_path=args.quantization_param_path,
+              device=args.device,
+              ray_workers_use_nsight=args.ray_workers_use_nsight,
+              use_v2_block_manager=args.use_v2_block_manager,
+              enable_chunked_prefill=args.enable_chunked_prefill,
+              download_dir=args.download_dir,
+              block_size=args.block_size,
+              gpu_memory_utilization=args.gpu_memory_utilization,
+              distributed_executor_backend=args.distributed_executor_backend)
+
+    # ignore_eos=True together with max_tokens requests exactly
+    # args.output_len generated tokens per sequence.
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=0.0 if args.use_beam_search else 1.0,
+        top_p=1.0,
+        use_beam_search=args.use_beam_search,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+    )
+    print(sampling_params)
+    # Random token ids in [0, 10000), shaped (batch_size, input_len).
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_inputs: List[PromptStrictInputs] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+
+    def run_to_completion(profile_dir: Optional[str] = None):
+        """Run one generate call over the dummy batch.
+
+        Returns the elapsed seconds when ``profile_dir`` is falsy. When a
+        profile directory is given, the call runs under the torch profiler
+        and nothing is returned.
+        """
+        if profile_dir:
+            with torch.profiler.profile(
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.CUDA,
+                    ],
+                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                        str(profile_dir))) as p:
+                llm.generate(dummy_inputs,
+                             sampling_params=sampling_params,
+                             use_tqdm=False)
+            print(p.key_averages())
+        else:
+            start_time = time.perf_counter()
+            llm.generate(dummy_inputs,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+            end_time = time.perf_counter()
+            latency = end_time - start_time
+            return latency
+
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion(profile_dir=None)
+
+    if args.profile:
+        profile_dir = args.profile_result_dir
+        if not profile_dir:
+            # Default to a timestamped directory under the working directory.
+            profile_dir = Path(
+                "."
+            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        run_to_completion(profile_dir=profile_dir)
+        # Profiling mode skips the timed benchmark below.
+        return
+
+    # Benchmark.
+    latencies = []
+    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
+        latencies.append(run_to_completion(profile_dir=None))
+    latencies = np.array(latencies)
+    percentages = [10, 25, 50, 75, 90]
+    percentiles = np.percentile(latencies, percentages)
+    print(f'Avg latency: {np.mean(latencies)} seconds')
+    for percentage, percentile in zip(percentages, percentiles):
+        print(f'{percentage}% percentile latency: {percentile} seconds')
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+# Script entry point: declare the command-line options, parse them and run
+# the latency benchmark.
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Benchmark the latency of processing a single batch of '
+        'requests till completion.')
+    parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--speculative-model', type=str, default=None)
+    parser.add_argument('--num-speculative-tokens', type=int, default=None)
+    parser.add_argument('--tokenizer', type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--input-len', type=int, default=32)
+    parser.add_argument('--output-len', type=int, default=128)
+    parser.add_argument('--batch-size', type=int, default=8)
+    parser.add_argument('--n',
+                        type=int,
+                        default=1,
+                        help='Number of generated sequences per prompt.')
+    parser.add_argument('--use-beam-search', action='store_true')
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=10,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=30,
+                        help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--enforce-eager',
+                        action='store_true',
+                        help='enforce eager mode and disable CUDA graph')
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
+    parser.add_argument('--block-size',
+                        type=int,
+                        default=16,
+                        help='block size of key/value cache')
+    parser.add_argument(
+        '--enable-chunked-prefill',
+        action='store_true',
+        help='If True, the prefill requests can be chunked based on the '
+        'max_num_batched_tokens')
+    parser.add_argument('--use-v2-block-manager', action='store_true')
+    parser.add_argument(
+        "--ray-workers-use-nsight",
+        action='store_true',
+        help="If specified, use nsight to profile ray workers",
+    )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
+    # The trailing space after "0 to 1." keeps the adjacent string literals
+    # from running together in the rendered help text.
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1. '
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -0,0 +1,62 @@
+import argparse
+import time
+
+from vllm import LLM, SamplingParams
+
+PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
+
+
+def test_prefix(llm=None, sampling_params=None, prompts=None):
+    """Run one generation pass over ``prompts`` and print the elapsed time.
+
+    Args:
+        llm: Engine object exposing ``generate(prompts, sampling_params=...)``.
+        sampling_params: Sampling configuration forwarded to ``generate``.
+        prompts: Prompts forwarded to ``generate``.
+    """
+    # perf_counter() is monotonic and high resolution, so the measured
+    # interval cannot be skewed by wall-clock adjustments during the run.
+    start_time = time.perf_counter()
+
+    llm.generate(prompts, sampling_params=sampling_params)
+
+    end_time = time.perf_counter()
+    print(f"cost time {end_time - start_time}")
+
+
+def main(args):
+    """Build the engine and time two identical generation passes.
+
+    The same batch of prompts is submitted twice, first under a "warm up"
+    banner and then under a "start generating" banner; each pass is timed
+    and reported by ``test_prefix``.
+    """
+    engine = LLM(
+        model=args.model,
+        tokenizer_mode='auto',
+        trust_remote_code=True,
+        enforce_eager=True,
+        use_v2_block_manager=args.use_v2_block_manager,
+        tensor_parallel_size=args.tensor_parallel_size,
+        enable_prefix_caching=args.enable_prefix_caching,
+    )
+
+    # Every request in the batch carries the same prompt text.
+    batch = [PROMPT] * 100
+    params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    for banner in ("------warm up------", "------start generating------"):
+        print(banner)
+        test_prefix(llm=engine, prompts=batch, sampling_params=params)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the benchmark options and run main().
+    parser = argparse.ArgumentParser(
+        description='Benchmark the performance with or without automatic '
+        'prefix caching.')
+    parser.add_argument('--model',
+                        type=str,
+                        default='baichuan-inc/Baichuan2-13B-Chat')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    # Help text previously misspelled the class name as "BlockSpaceMangerV2".
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceManagerV2')
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -0,0 +1,644 @@
+"""Benchmark online serving throughput.
+
+On the server side, run one of the following commands:
+    vLLM OpenAI API server
+    python -m vllm.entrypoints.openai.api_server \
+        --model <your_model> --swap-space 16 \
+        --disable-log-requests
+
+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
+On the client side, run:
+    python benchmarks/benchmark_serving.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset-name sharegpt \
+        --dataset-path <path to dataset> \
+        --request-rate <request_rate> \
+        --num-prompts <num_prompts>
+
+    By default <request_rate> is inf and <num_prompts> is 1000.
+
+    when using tgi backend, add
+        --endpoint /generate_stream
+    to the end of the command above.
+"""
+import argparse
+import asyncio
+import json
+import os
+import random
+import time
+import warnings
+from dataclasses import dataclass
+from datetime import datetime
+from typing import AsyncGenerator, List, Optional, Tuple
+
+import numpy as np
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  RequestFuncOutput)
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+
+@dataclass
+class BenchmarkMetrics:
+    """Aggregate results of one serving benchmark run.
+
+    Fields ending in ``_ms`` are latencies reported in milliseconds.
+    """
+    # Number of requests whose output reported success.
+    completed: int
+    # Sum of prompt token counts over the successful requests.
+    total_input: int
+    # Sum of tokenizer-counted generated tokens (failed requests count as 0).
+    total_output: int
+    # Successful requests / input tokens / output tokens per second of
+    # benchmark duration.
+    request_throughput: float
+    input_throughput: float
+    output_throughput: float
+    # Time to first token (TTFT): mean, median and 99th percentile.
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    p99_ttft_ms: float
+    # Time per output token excluding the first token (TPOT).
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    p99_tpot_ms: float
+    # Inter-token latency (ITL).
+    mean_itl_ms: float
+    median_itl_ms: float
+    p99_itl_ms: float
+
+
+def sample_sharegpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, int, int]]:
+    """Sample up to ``num_requests`` prompts from a ShareGPT-style JSON file.
+
+    Args:
+        dataset_path: Path to a JSON list whose items hold a
+            ``"conversations"`` list of ``{"value": ...}`` turns.
+        num_requests: Maximum number of requests to return.
+        tokenizer: Callable returning an object with ``input_ids``.
+        fixed_output_len: When given, used as the output length of every
+            request instead of the tokenized length of the second turn.
+
+    Returns:
+        A list of ``(prompt, prompt_len, output_len)`` tuples. It may hold
+        fewer than ``num_requests`` items when too few conversations pass
+        the length filters.
+
+    Raises:
+        ValueError: If ``fixed_output_len`` is smaller than 4.
+    """
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # JSON text is UTF-8; do not depend on the platform's locale encoding.
+    with open(dataset_path, encoding="utf-8") as f:
+        dataset = json.load(f)
+    # Keep only the first two turns of conversations with >= 2 turns.
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset
+               if len(data["conversations"]) >= 2]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short.
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for prompt, completion in dataset:
+        if len(filtered_dataset) == num_requests:
+            break
+
+        prompt_len = len(tokenizer(prompt).input_ids)
+        if fixed_output_len is None:
+            output_len = len(tokenizer(completion).input_ids)
+        else:
+            # The completion is never tokenized when its length is unused.
+            output_len = fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+        filtered_dataset.append((prompt, prompt_len, output_len))
+
+    return filtered_dataset
+
+
+def sample_sonnet_requests(
+    dataset_path: str,
+    num_requests: int,
+    input_len: int,
+    output_len: int,
+    prefix_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, str, int, int]]:
+    """Build prompts of roughly ``input_len`` tokens from lines of a text file.
+
+    Every prompt starts with the same instruction and the same leading
+    lines of the file (about ``prefix_len`` tokens in total); the rest of
+    the prompt is filled with randomly sampled lines.
+
+    Args:
+        dataset_path: Path to the text file; each line is one sample unit.
+        num_requests: Number of prompts to build.
+        input_len: Approximate number of tokens per prompt.
+        output_len: Output length recorded for every request.
+        prefix_len: Approximate number of tokens shared by all prompts.
+        tokenizer: Tokenizer that is callable and provides
+            ``apply_chat_template``.
+
+    Returns:
+        A list of ``(prompt, prompt_formatted, prompt_len, output_len)``
+        tuples, where ``prompt_formatted`` is the chat-templated prompt and
+        ``prompt_len`` is its token count.
+    """
+    assert (
+        input_len > prefix_len
+    ), "'args.sonnet-input-len' must be greater than 'args.sonnet-prefix-len'."
+
+    # Load the dataset.
+    with open(dataset_path, encoding="utf-8") as f:
+        poem_lines = f.readlines()
+
+    # Tokenize the poem lines.
+    poem_token_ids = tokenizer(poem_lines).input_ids
+    average_poem_len = sum(
+        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
+
+    # Base prefix for all requests.
+    base_prompt = "Pick as many lines as you can from these poem lines:\n"
+    base_message = [{
+        "role": "user",
+        "content": base_prompt,
+    }]
+    base_prompt_formatted = tokenizer.apply_chat_template(
+        base_message, add_generation_prompt=True, tokenize=False)
+    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
+
+    assert (
+        input_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
+    # Line counts are estimated from the average tokenized line length.
+    num_input_lines = round(
+        (input_len - base_prompt_offset) / average_poem_len)
+
+    # First approximately `prefix_len` number of tokens in the
+    # prompt are fixed poem lines.
+    assert (
+        prefix_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
+
+    num_prefix_lines = round(
+        (prefix_len - base_prompt_offset) / average_poem_len)
+    prefix_lines = poem_lines[:num_prefix_lines]
+
+    # Sample the rest of lines per request.
+    sampled_requests: List[Tuple[str, str, int, int]] = []
+    for _ in range(num_requests):
+        sampled_lines = "".join(
+            prefix_lines +
+            random.sample(poem_lines, num_input_lines - num_prefix_lines))
+
+        prompt = f"{base_prompt}{sampled_lines}"
+        message = [
+            {
+                "role": "user",
+                "content": prompt,
+            },
+        ]
+        prompt_formatted = tokenizer.apply_chat_template(
+            message, add_generation_prompt=True, tokenize=False)
+        prompt_len = len(tokenizer(prompt_formatted).input_ids)
+        sampled_requests.append(
+            (prompt, prompt_formatted, prompt_len, output_len))
+
+    return sampled_requests
+
+
+async def get_request(
+    input_requests: List[Tuple[str, int, int]],
+    request_rate: float,
+) -> AsyncGenerator[Tuple[str, int, int], None]:
+    """Yield the requests in order, pausing between them to pace the traffic.
+
+    With an infinite ``request_rate`` the requests are yielded back to
+    back. Otherwise each yield is followed by a sleep whose length is
+    drawn from an exponential distribution with mean ``1 / request_rate``
+    seconds.
+    """
+    unthrottled = request_rate == float("inf")
+    for item in input_requests:
+        yield item
+
+        if not unthrottled:
+            # Draw the gap before the next request, then wait it out.
+            gap = np.random.exponential(1.0 / request_rate)
+            await asyncio.sleep(gap)
+
+
+def calculate_metrics(
+    input_requests: List[Tuple[str, int, int]],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    """Summarize per-request outputs into aggregate benchmark metrics.
+
+    Args:
+        input_requests: The submitted ``(prompt, prompt_len, output_len)``
+            tuples, index-aligned with ``outputs``.
+        outputs: One result object per submitted request.
+        dur_s: Benchmark duration in seconds, used for the throughputs.
+        tokenizer: Used to count the tokens of each generated text.
+
+    Returns:
+        The aggregate metrics and the per-request generated token counts
+        (0 for requests that did not succeed).
+    """
+    actual_output_lens = []
+    ttfts, tpots, itls = [], [], []
+    total_input = 0
+    completed = 0
+
+    for idx, out in enumerate(outputs):
+        if not out.success:
+            actual_output_lens.append(0)
+            continue
+
+        # We use the tokenizer to count the number of output tokens for all
+        # serving backends instead of looking at len(out.itl) since
+        # multiple output tokens may be bundled together
+        # Note: this may inflate the output token count slightly
+        token_ids = tokenizer(out.generated_text,
+                              add_special_tokens=False).input_ids
+        n_generated = len(token_ids)
+        actual_output_lens.append(n_generated)
+        total_input += input_requests[idx][1]
+        if n_generated > 1:
+            # Per-token time excludes the first token.
+            tpots.append((out.latency - out.ttft) / (n_generated - 1))
+        itls += out.itl
+        ttfts.append(out.ttft)
+        completed += 1
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2)
+
+    def to_ms(reducer, samples):
+        # An empty sample list (e.g. ttfts when streaming is not supported
+        # by the backend) is replaced by a scalar 0 before reducing.
+        return reducer(samples or 0) * 1000
+
+    def p99(samples):
+        return np.percentile(samples, 99)
+
+    total_output = sum(actual_output_lens)
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=total_output,
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=total_output / dur_s,
+        mean_ttft_ms=to_ms(np.mean, ttfts),
+        median_ttft_ms=to_ms(np.median, ttfts),
+        p99_ttft_ms=to_ms(p99, ttfts),
+        mean_tpot_ms=to_ms(np.mean, tpots),
+        median_tpot_ms=to_ms(np.median, tpots),
+        p99_tpot_ms=to_ms(p99, tpots),
+        mean_itl_ms=to_ms(np.mean, itls),
+        median_itl_ms=to_ms(np.median, itls),
+        p99_itl_ms=to_ms(p99, itls),
+    )
+
+    return metrics, actual_output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: List[Tuple[str, int, int]],
+    best_of: int,
+    use_beam_search: bool,
+    request_rate: float,
+    disable_tqdm: bool,
+):
+    """Send all requests to the server, print a report and return the results.
+
+    A single probe request is sent first; the main run only starts when it
+    succeeds. Requests are then dispatched as concurrent tasks, paced by
+    ``get_request``, and awaited together.
+
+    Returns:
+        A dict holding the aggregate metrics plus per-request input/output
+        lengths, TTFTs, ITLs, generated texts and errors.
+
+    Raises:
+        ValueError: If ``backend`` is unknown or the probe request fails.
+    """
+    if backend not in ASYNC_REQUEST_FUNCS:
+        raise ValueError(f"Unknown backend: {backend}")
+    request_func = ASYNC_REQUEST_FUNCS.get(backend)
+
+    def make_input(prompt, prompt_len, output_len):
+        # Model, endpoint and decoding options are shared by all requests.
+        return RequestFuncInput(
+            model=model_id,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            best_of=best_of,
+            use_beam_search=use_beam_search,
+        )
+
+    print("Starting initial single prompt test run...")
+    test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    test_output = await request_func(request_func_input=make_input(
+        test_prompt, test_prompt_len, test_output_len))
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}")
+    print("Initial test run completed. Starting main benchmark run...")
+    print(f"Traffic request rate: {request_rate}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    benchmark_start_time = time.perf_counter()
+    tasks = []
+    async for prompt, prompt_len, output_len in get_request(
+            input_requests, request_rate):
+        coro = request_func(
+            request_func_input=make_input(prompt, prompt_len, output_len),
+            pbar=pbar)
+        tasks.append(asyncio.create_task(coro))
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if pbar is not None:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, actual_output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+    )
+
+    # The report is a list of sections: (title, fill char, rows), where each
+    # row is (label, value, row template).
+    plain_row = "{:<40} {:<10}"
+    float_row = "{:<40} {:<10.2f}"
+    report = [
+        (' Serving Benchmark Result ', '=', [
+            ("Successful requests:", metrics.completed, plain_row),
+            ("Benchmark duration (s):", benchmark_duration, float_row),
+            ("Total input tokens:", metrics.total_input, plain_row),
+            ("Total generated tokens:", metrics.total_output, plain_row),
+            ("Request throughput (req/s):", metrics.request_throughput,
+             float_row),
+            ("Input token throughput (tok/s):", metrics.input_throughput,
+             float_row),
+            ("Output token throughput (tok/s):", metrics.output_throughput,
+             float_row),
+        ]),
+        ('Time to First Token', '-', [
+            ("Mean TTFT (ms):", metrics.mean_ttft_ms, float_row),
+            ("Median TTFT (ms):", metrics.median_ttft_ms, float_row),
+            ("P99 TTFT (ms):", metrics.p99_ttft_ms, float_row),
+        ]),
+        ('Time per Output Token (excl. 1st token)', '-', [
+            ("Mean TPOT (ms):", metrics.mean_tpot_ms, float_row),
+            ("Median TPOT (ms):", metrics.median_tpot_ms, float_row),
+            ("P99 TPOT (ms):", metrics.p99_tpot_ms, float_row),
+        ]),
+        ('Inter-token Latency', '-', [
+            ("Mean ITL (ms):", metrics.mean_itl_ms, float_row),
+            ("Median ITL (ms):", metrics.median_itl_ms, float_row),
+            ("P99 ITL (ms):", metrics.p99_itl_ms, float_row),
+        ]),
+    ]
+    for title, fill, rows in report:
+        print("{s:{c}^{n}}".format(s=title, n=50, c=fill))
+        for label, value, template in rows:
+            print(template.format(label, value))
+    print("=" * 50)
+
+    # Aggregate metrics first, then the per-request details.
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_throughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "p99_ttft_ms": metrics.p99_ttft_ms,
+        "mean_tpot_ms": metrics.mean_tpot_ms,
+        "median_tpot_ms": metrics.median_tpot_ms,
+        "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
+    }
+    result["input_lens"] = [out.prompt_len for out in outputs]
+    result["output_lens"] = actual_output_lens
+    result["ttfts"] = [out.ttft for out in outputs]
+    result["itls"] = [out.itl for out in outputs]
+    result["generated_texts"] = [out.generated_text for out in outputs]
+    result["errors"] = [out.error for out in outputs]
+    return result
+
+
+def main(args: argparse.Namespace):
+    """Run the serving benchmark described by ``args`` and optionally save it.
+
+    Seeds the random generators, resolves the API URL and tokenizer,
+    samples the input requests from the selected dataset and runs
+    ``benchmark``. When ``args.save_result`` is set, the run configuration
+    and the results are written to a JSON file.
+
+    Raises:
+        ValueError: If the dataset name is unknown or a ``--metadata`` item
+            is not in ``KEY=VALUE`` form.
+    """
+    print(args)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+
+    tokenizer = get_tokenizer(tokenizer_id,
+                              trust_remote_code=args.trust_remote_code)
+
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next "
+            "release. Please use '--dataset-name' and "
+            "'--dataset-path' in the future runs.",
+            stacklevel=2)
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+
+    elif args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+
+    elif args.dataset_name == "sonnet":
+        # The chat backend receives the raw prompt and formats it itself;
+        # every other backend receives the chat-templated prompt.
+        use_raw_prompt = args.backend == "openai-chat"
+        if not use_raw_prompt:
+            assert (
+                tokenizer.chat_template or tokenizer.default_chat_template
+            ), "Tokenizer/model must have chat template for sonnet dataset."
+        sonnet_requests = sample_sonnet_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            input_len=args.sonnet_input_len,
+            output_len=args.sonnet_output_len,
+            prefix_len=args.sonnet_prefix_len,
+            tokenizer=tokenizer,
+        )
+        input_requests = [
+            (prompt if use_raw_prompt else prompt_formatted, prompt_len,
+             output_len)
+            for prompt, prompt_formatted, prompt_len, output_len in
+            sonnet_requests
+        ]
+
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+
+    benchmark_result = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            best_of=args.best_of,
+            use_beam_search=args.use_beam_search,
+            request_rate=args.request_rate,
+            disable_tqdm=args.disable_tqdm,
+        ))
+
+    # Save config and results to json
+    if args.save_result:
+        result_json = {}
+
+        # Setup
+        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json["date"] = current_dt
+        result_json["backend"] = backend
+        result_json["model_id"] = model_id
+        result_json["tokenizer_id"] = tokenizer_id
+        result_json["best_of"] = args.best_of
+        result_json["use_beam_search"] = args.use_beam_search
+        result_json["num_prompts"] = args.num_prompts
+
+        # Metadata
+        if args.metadata:
+            for item in args.metadata:
+                if "=" not in item:
+                    raise ValueError(
+                        "Invalid metadata format. Please use KEY=VALUE format."
+                    )
+                # Split on the first "=" only, so values that themselves
+                # contain "=" are kept intact instead of being truncated.
+                key, _, value = item.partition("=")
+                result_json[key.strip()] = value.strip()
+
+        # Traffic: an infinite rate is stored as the string "inf".
+        result_json["request_rate"] = (
+            args.request_rate if args.request_rate < float("inf") else "inf")
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
+        # Save to file
+        base_model_id = model_id.split("/")[-1]
+        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_dir:
+            file_name = os.path.join(args.result_dir, file_name)
+        with open(file_name, "w") as outfile:
+            json.dump(result_json, outfile)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: declare the options, parse them, run main().
+    parser = argparse.ArgumentParser(
+        description="Benchmark the online serving throughput.")
+
+    # Server location and backend.
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/v1/completions",
+        help="API endpoint.",
+    )
+
+    # Dataset selection.
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Path to the ShareGPT dataset, will be deprecated in the "
+        "next release.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=["sharegpt", "sonnet"],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument("--dataset-path",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+
+    # Model, tokenizer and decoding options.
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",
+    )
+    parser.add_argument(
+        "--best-of",
+        type=int,
+        default=1,
+        help="Generates `best_of` sequences per prompt and "
+        "returns the best one.",
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+
+    # Workload shape.
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.")
+    parser.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help=
+        "Number of input tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help=
+        "Number of output tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help=
+        "Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize "
+        "the request arrival times.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+
+    # Reporting and result persistence.
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--save-result",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+    parser.add_argument(
+        "--metadata",
+        metavar="KEY=VALUE",
+        nargs="*",
+        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
+        "for metadata of this run to be saved in the result JSON file "
+        "for record keeping purposes.",
+    )
+    # The first help fragment ends with a space so that the two sentences
+    # are not rendered glued together ("results.If not specified").
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results. "
+        "If not specified, results are saved in the current directory.",
+    )
+
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -0,0 +1,411 @@
+"""Benchmark offline inference throughput."""
+import argparse
+import json
+import random
+import time
+from typing import List, Optional, Tuple
+
+import torch
+from tqdm import tqdm
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizerBase)
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def sample_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int],
+) -> List[Tuple[str, int, int]]:
+    """Sample up to `num_requests` (prompt, prompt_len, output_len) tuples.
+
+    The JSON file must hold records with a "conversations" list: turn 0 is
+    the prompt, turn 1 the completion. `fixed_output_len`, when given,
+    replaces the tokenized completion length. Samples are dropped when the
+    prompt or output is under 4 tokens, the prompt is over 1024 tokens, or
+    prompt + output is over 2048. Raises ValueError if `fixed_output_len` < 4.
+    """
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Keep only the first two turns of conversations with at least 2 turns.
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset
+               if len(data["conversations"]) >= 2]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for prompt, completion in dataset:
+        if len(filtered_dataset) == num_requests:
+            break
+        prompt_len = len(tokenizer(prompt).input_ids)
+        if fixed_output_len is None:
+            output_len = len(tokenizer(completion).input_ids)
+        else:
+            # The completion's own length is unused, so skip tokenizing it.
+            output_len = fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            continue  # Prune too short sequences.
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            continue  # Prune too long sequences.
+        filtered_dataset.append((prompt, prompt_len, output_len))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
+    gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
+) -> float:
+    """Return the seconds `LLM.generate` takes over `requests`.
+
+    Engine construction is excluded from the timing."""
+    from vllm import LLM, SamplingParams
+    engine = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
+    )
+
+    # One SamplingParams per request so each keeps its own output length.
+    prompts = [prompt for prompt, _, _ in requests]
+    sampling_params = [
+        SamplingParams(
+            n=n,
+            temperature=0.0 if use_beam_search else 1.0,
+            top_p=1.0,
+            use_beam_search=use_beam_search,
+            ignore_eos=True,
+            max_tokens=output_len,
+        ) for _, _, output_len in requests
+    ]
+
+    tic = time.perf_counter()
+    engine.generate(prompts, sampling_params, use_tqdm=True)
+    return time.perf_counter() - tic
+
+
+def run_hf(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: PreTrainedTokenizerBase,
+    n: int,
+    use_beam_search: bool,
+    max_batch_size: int,
+    trust_remote_code: bool,
+) -> float:
+    """Time batched HF `generate` calls over `requests`; return seconds."""
+    assert not use_beam_search
+    llm = AutoModelForCausalLM.from_pretrained(
+        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    if llm.config.model_type == "llama":
+        # Reuse EOS as the pad token so padded batches can be tokenized.
+        tokenizer.pad_token = tokenizer.eos_token
+    llm = llm.cuda()
+    pbar = tqdm(total=len(requests))
+    start = time.perf_counter()
+    batch: List[str] = []
+    max_prompt_len = 0
+    max_output_len = 0
+    for i in range(len(requests)):
+        prompt, prompt_len, output_len = requests[i]
+        # Add the prompt to the batch and track the batch-wide maxima.
+        batch.append(prompt)
+        max_prompt_len = max(max_prompt_len, prompt_len)
+        max_output_len = max(max_output_len, output_len)
+        if len(batch) < max_batch_size and i != len(requests) - 1:
+            # Peek ahead: would the next request keep max lengths <= 2048?
+            _, next_prompt_len, next_output_len = requests[i + 1]
+            if (max(max_prompt_len, next_prompt_len) +
+                    max(max_output_len, next_output_len)) <= 2048:
+                # It fits, so keep accumulating before generating.
+                continue
+
+        # Generate the sequences for the accumulated batch.
+        input_ids = tokenizer(batch, return_tensors="pt",
+                              padding=True).input_ids
+        llm_outputs = llm.generate(
+            input_ids=input_ids.cuda(),
+            do_sample=not use_beam_search,
+            num_return_sequences=n,
+            temperature=1.0,
+            top_p=1.0,
+            use_cache=True,
+            max_new_tokens=max_output_len,  # longest output in this batch
+        )
+        # Include the decoding time.
+        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+        pbar.update(len(batch))
+
+        # Clear the batch.
+        batch = []
+        max_prompt_len = 0
+        max_output_len = 0
+    end = time.perf_counter()
+    return end - start
+
+
+def run_mii(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tensor_parallel_size: int,
+    output_len: int,
+) -> float:
+    """Return the seconds an MII server spends generating for `requests`."""
+    from mii import client, serve
+    server = serve(model, tensor_parallel=tensor_parallel_size)
+    prompts = [request[0] for request in requests]
+    tic = time.perf_counter()
+    server.generate(prompts, max_new_tokens=output_len)
+    elapsed = time.perf_counter() - tic
+    # Shut the server down only after the timed section.
+    client(model).terminate_server()
+    return elapsed
+
+
+def main(args: argparse.Namespace):
+    """Sample requests, run the selected backend, and report throughput."""
+    print(args)
+    random.seed(args.seed)
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    if args.dataset is not None:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)
+    else:
+        # Synthesize a prompt with the given input length.
+        prompt = "hi" * (args.input_len - 1)
+        requests = [(prompt, args.input_len, args.output_len)
+                    for _ in range(args.num_prompts)]
+
+    if args.backend == "vllm":
+        elapsed_time = run_vllm(
+            requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
+            args.enable_prefix_caching, args.enable_chunked_prefill,
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.download_dir)
+    elif args.backend == "hf":
+        assert args.tensor_parallel_size == 1
+        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
+                              args.use_beam_search, args.hf_max_batch_size,
+                              args.trust_remote_code)
+    elif args.backend == "mii":
+        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
+                               args.output_len)
+    else:
+        raise ValueError(f"Unknown backend: {args.backend}")
+    total_num_tokens = sum(prompt_len + output_len
+                           for _, prompt_len, output_len in requests)
+    results = {
+        "elapsed_time": elapsed_time,
+        "num_requests": len(requests),
+        "total_num_tokens": total_num_tokens,
+        "requests_per_second": len(requests) / elapsed_time,
+        "tokens_per_second": total_num_tokens / elapsed_time,
+    }
+    print(f"Throughput: {results['requests_per_second']:.2f} requests/s, "
+          f"{results['tokens_per_second']:.2f} tokens/s")
+
+    # Output JSON results if specified
+    if args.output_json:
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
+                        default="vllm")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to process.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--hf-max-batch-size",
+                        type=int,
+                        default=None,
+                        help="Maximum batch size for HF backend.")
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1. '
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument("--enforce-eager",
+                        action="store_true",
+                        help="enforce eager execution")
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
+    parser.add_argument(
+        "--enable-prefix-caching",
+        action='store_true',
+        help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument("--enable-chunked-prefill",
+                        action='store_true',
+                        help="enable chunked prefill for vLLM backend.")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=None,
+                        help='maximum number of batched tokens per '
+                        'iteration')
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    if args.dataset is None:  # parser.error (not assert) survives python -O
+        if args.input_len is None or args.output_len is None:
+            parser.error("--input-len and --output-len are required "
+                         "when --dataset is not given.")
+    elif args.input_len is not None:
+        parser.error("--input-len cannot be combined with --dataset.")
+    if args.backend == "vllm":
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+    elif args.backend == "hf":
+        if args.hf_max_batch_size is None:
+            raise ValueError("HF max batch size is required for HF backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+    elif args.backend == "mii":
+        if args.dtype != "auto":
+            raise ValueError("dtype must be auto for MII backend.")
+        if args.n != 1:
+            raise ValueError("n must be 1 for MII backend.")
+        if args.use_beam_search:
+            raise ValueError("Beam search is not supported for MII backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+        if args.tokenizer != args.model:
+            raise ValueError("Tokenizer must be the same as the model for MII "
+                             "backend.")
+    main(args)
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -0,0 +1,352 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]  # every model but the first
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]  # used as GEMM M values
+DEFAULT_TP_SIZES = [1]
+
+# helpers
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    """Clamp to the float8_e4m3fn range, round, and cast to float8_e4m3fn."""
+    fp8 = torch.finfo(torch.float8_e4m3fn)
+    return tensor.clamp(fp8.min, fp8.max).round().to(torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.clamp(min=-128, max=127).round().to(dtype=torch.int8)
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Return random CUDA operands a (m x k) and b (k x n) cast to `dtype`.
+
+    Raises ValueError unless `dtype` is torch.int8 or torch.float8_e4m3fn."""
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+    if dtype == torch.int8:
+        return to_int8(a), to_int8(b)
+    if dtype == torch.float8_e4m3fn:
+        return to_fp8(a), to_fp8(b)
+    raise ValueError("unsupported dtype")
+
+
+# impl
+
+
+def pytorch_i8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                    scale_b: torch.Tensor,
+                    out_dtype: torch.dtype) -> torch.Tensor:
+    return torch.mm(a, b)  # baseline: scales and out_dtype are ignored
+
+
+def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                     scale_b: torch.Tensor,
+                     out_dtype: torch.dtype) -> torch.Tensor:
+    """Reference fp8 GEMM via PyTorch's private `torch._scaled_mm`.
+
+    `use_fast_accum` is not passed, so it stays at the op's default."""
+    return torch._scaled_mm(
+        a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=out_dtype)
+
+
+def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
+                                scale_a: torch.Tensor, scale_b: torch.Tensor,
+                                out_dtype: torch.dtype) -> torch.Tensor:
+    """Reference fp8 GEMM via PyTorch's private `torch._scaled_mm`.
+
+    Same call as `pytorch_fp8_impl` but passes `use_fast_accum=True`.
+    """
+    return torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b,
+                            out_dtype=out_dtype, use_fast_accum=True)
+
+
+def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                 scale_b: torch.Tensor,
+                 out_dtype: torch.dtype) -> torch.Tensor:
+    """Run vLLM's `ops.cutlass_scaled_mm_dq` custom op on the operands.
+
+    Thin wrapper so the op shares the benchmark call signature.
+    """
+    return ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype=out_dtype)
+
+
+# bench
+def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+             scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
+             sub_label: str, fn: Callable, description: str) -> TMeasurement:
+    """Time `fn(a, b, scale_a, scale_b, out_dtype)` with a torch Timer."""
+    min_run_time = 1
+    # Named timer_globals so the builtin globals() is not shadowed.
+    timer_globals = {
+        "a": a,
+        "b": b,
+        "scale_a": scale_a,
+        "scale_b": scale_b,
+        "out_dtype": out_dtype,
+        "fn": fn,
+    }
+    return TBenchmark.Timer(
+        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
+        globals=timer_globals,
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    ).blocked_autorange(min_run_time=min_run_time)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+               sub_label: str) -> Iterable[TMeasurement]:
+    """Benchmark int8 GEMM implementations for one (m, k, n) shape.
+
+    Times a bf16 `torch.mm` baseline and the cutlass int8 scaled GEMM.
+    """
+    assert dtype == torch.int8
+    a, b = make_rand_tensors(torch.int8, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    bf16_a = a.to(dtype=torch.bfloat16, device="cuda")
+    bf16_b = b.to(dtype=torch.bfloat16, device="cuda")
+    return [
+        # pytorch impl: operands upcast to bf16, scales unused
+        bench_fn(bf16_a, bf16_b, scale_a, scale_b, torch.bfloat16, label,
+                 sub_label, pytorch_i8_impl,
+                 "pytorch_bf16_bf16_bf16_matmul-no-scales"),
+        # cutlass impl: handed CPU copies of the scales
+        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
+                 torch.bfloat16, label, sub_label, cutlass_impl,
+                 "cutlass_i8_i8_bf16_scaled_mm"),
+    ]
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+              sub_label: str) -> Iterable[TMeasurement]:
+    """Benchmark fp8 GEMM implementations for one (m, k, n) shape.
+
+    Times `torch._scaled_mm` with and without fast accumulation and the
+    cutlass scaled GEMM, each for bf16 and fp16 outputs. Measurements are
+    returned in the order of the case tables below.
+    """
+    assert dtype == torch.float8_e4m3fn
+    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+
+    # (output dtype, implementation, description), in reporting order.
+    pytorch_cases = [
+        (torch.bfloat16, pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"),
+        (torch.bfloat16, pytorch_fp8_impl_fast_accum,
+         "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"),
+        (torch.float16, pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"),
+        (torch.float16, pytorch_fp8_impl_fast_accum,
+         "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"),
+    ]
+
+    cutlass_cases = [
+        (torch.bfloat16, "cutlass_fp8_fp8_bf16_scaled_mm"),
+        (torch.float16, "cutlass_fp8_fp8_fp16_scaled_mm"),
+    ]
+
+    timers = []
+    for out_dtype, impl, description in pytorch_cases:
+        timers.append(
+            bench_fn(a, b, scale_a, scale_b, out_dtype, label, sub_label,
+                     impl, description))
+
+    for out_dtype, description in cutlass_cases:
+        # The cutlass path is handed CPU copies of the scales.
+        timers.append(
+            bench_fn(a, b, scale_a.to(device="cpu"),
+                     scale_b.to(device="cpu"), out_dtype, label, sub_label,
+                     cutlass_impl, description))
+
+    return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+          sub_label: str) -> Iterable[TMeasurement]:
+    if dtype == torch.int8:  # dispatch to the dtype-specific benchmark
+        return bench_int8(dtype, m, k, n, label, sub_label)
+    if dtype == torch.float8_e4m3fn:
+        return bench_fp8(dtype, m, k, n, label, sub_label)
+    raise ValueError("unsupported type")  # neither int8 nor float8_e4m3fn
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    """Print the measurements as a `torch.utils.benchmark.Compare` table."""
+    TBenchmark.Compare(timers).print()
+
+
+def run(dtype: torch.dtype,
+        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    """Benchmark every (m, k, n) shape, printing each shape's table."""
+    all_timers = []
+    for m, k, n in MKNs:
+        shape_timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
+                             f"MKN=({m}x{k}x{n})")
+        print_timers(shape_timers)
+        all_timers += shape_timers
+
+    return all_timers
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+                MKNs: Iterable[Tuple[int, int, int]],
+                base_description: str,
+                timestamp=None):
+    """Print `data` and pickle it to `<base_description>-<timestamp>.pkl`.
+
+    `timestamp` defaults to the current Unix time; `MKNs` is unused."""
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+    stamp = int(time.time()) if timestamp is None else timestamp
+    with open(f"{base_description}-{stamp}.pkl", "wb") as f:
+        pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+    """Benchmark square GEMMs (M == K == N) over an inclusive dim range."""
+    dims = range(args.dim_start, args.dim_end + 1, args.dim_increment)
+    MKNs = [(dim, dim, dim) for dim in dims]
+    results = run(args.dtype, MKNs)
+
+    make_output(results, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+    """Sweep dims inclusively; a `*_constant` arg pins that dim instead."""
+    dim_sizes = list(  # dim_end is inclusive, matching run_square_bench
+        range(args.dim_start, args.dim_end + 1, args.dim_increment))
+    n = len(dim_sizes)
+    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
+    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
+    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
+    MKNs = list(zip(Ms, Ks, Ns))
+    make_output(run(args.dtype, MKNs), MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+    """Benchmark the GEMM shapes of each requested model and TP size.
+
+    Every (model, tp_size) pair is run for all `args.batch_sizes`. Per-pair
+    result tables are printed once all runs finish, then every measurement
+    is pickled to `model_bench-<dtype>-<timestamp>.pkl`.
+    """
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+        # Divide the TP_SPLIT_DIM entry of a copy of each [K, N] by tp_size.
+        KNs = []
+        for shape, tp_split_dim in WEIGHT_SHAPES[model_name]:
+            KN = list(shape)
+            KN[tp_split_dim] //= tp_size
+            KNs.append(KN)
+        return KNs
+
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    model_bench_data = []
+    for model, tp_size in models_tps:
+        Ms = args.batch_sizes
+        KNs = model_shapes(model, tp_size)
+        MKNs = [(m, k, n) for m in Ms for k, n in KNs]
+        data = run(args.dtype, MKNs)
+        model_bench_data.append(data)
+
+    # Print all results
+    for (model, tp_size), data in zip(models_tps, model_bench_data):
+        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+        print_timers(data)
+
+    timestamp = int(time.time())
+
+    # pickle all data
+    all_data = list(itertools.chain.from_iterable(model_bench_data))
+    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(all_data, f)
+
+
+if __name__ == '__main__':
+    def to_torch_dtype(dt):
+        if dt == "int8":
+            return torch.int8
+        if dt == "fp8":
+            return torch.float8_e4m3fn
+        raise ValueError("unsupported dtype")
+
+    parser = argparse.ArgumentParser(
+        description="""
+Benchmark Cutlass GEMM.
+
+    To run square GEMMs:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+    
+    To run constant N and K and sweep M:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+    
+    To run dimensions from a model:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+    
+    Output:
+        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
+            """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument("--dtype",
+                        type=to_torch_dtype,
+                        required=True,
+                        help="Available options are ['int8', 'fp8']")
+    # Require a sub-command: without one `args.func` would not exist.
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+
+    square_parser = subparsers.add_parser("square_bench")
+    square_parser.add_argument("--dim-start", type=int, required=True)
+    square_parser.add_argument("--dim-end", type=int, required=True)
+    square_parser.add_argument("--dim-increment", type=int, required=True)
+    square_parser.set_defaults(func=run_square_bench)
+
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument("--dim-start", type=int, required=True)
+    range_parser.add_argument("--dim-end", type=int, required=True)
+    range_parser.add_argument("--dim-increment", type=int, required=True)
+    range_parser.add_argument("--m-constant", type=int, default=None)
+    range_parser.add_argument("--n-constant", type=int, default=None)
+    range_parser.add_argument("--k-constant", type=int, default=None)
+    range_parser.set_defaults(func=run_range_bench)
+
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument("--models",
+                              nargs="+",
+                              type=str,
+                              default=DEFAULT_MODELS,
+                              choices=WEIGHT_SHAPES.keys())
+    model_parser.add_argument("--tp-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_TP_SIZES)
+    model_parser.add_argument("--batch-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_BATCH_SIZES)
+    model_parser.set_defaults(func=run_model_bench)
+
+    args = parser.parse_args()
+    args.func(args)
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -0,0 +1,37 @@
+# Weight Shapes are in the format
+# ([K, N], TP_SPLIT_DIM)
+# Example:
+#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
+#   - TP1 : K = 14336, N = 4096
+#   - TP2 : K = 7168, N = 4096
+#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
+#   - TP1 : K = 4096, N = 6144
+#   - TP4 : K = 4096, N = 1536
+
+# TP1 (unsharded) shapes; for TP > 1 the TP_SPLIT_DIM entry shrinks as above.
+WEIGHT_SHAPES = {
+    "mistralai/Mistral-7B-v0.1": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-2-7b-hf": [
+        ([4096, 12288], 1),
+        ([4096, 4096], 0),
+        ([4096, 22016], 1),
+        ([11008, 4096], 0),
+    ],
+    "meta-llama/Llama-2-13b-hf": [
+        ([5120, 15360], 1),
+        ([5120, 5120], 0),
+        ([5120, 27648], 1),
+        ([13824, 5120], 0),
+    ],
+    "meta-llama/Llama-2-70b-hf": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+}
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -0,0 +1,302 @@
+import argparse
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.aqlm import (
+    dequantize_weight, generic_dequantize_gemm, get_int_dtype,
+    optimized_dequantize_gemm)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+        input: torch.Tensor,  #  [..., in_features]
+        weights: torch.Tensor,
+        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+    """Dense F.linear baseline; `scales` is accepted but not used."""
+    return F.linear(input, weights)
+
+
+def dequant_out_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Dequantize via ops.aqlm_dequant, then F.linear with `scales` applied."""
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+    # No bias: scale the matmul output. With a bias: scale the weights first.
+    if bias is None:
+        output = F.linear(input, weights, bias)
+        orig_shape = output.shape
+        flattened_output = output.view(-1, output.size(-1))
+        f_scales = scales.view(-1, scales.shape[0])
+        b_scales = f_scales.expand(flattened_output.shape[0], -1)
+        flattened_output *= b_scales
+        return flattened_output.view(orig_shape)
+    else:
+        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+            -1, weights.shape[1])
+        weights *= b_scales
+        return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Dequantize via ops.aqlm_dequant, scale the weights, then F.linear."""
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+    # Expand `scales` across weights.shape[1] and multiply weights in place.
+    b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+        -1, weights.shape[1])
+    weights *= b_scales
+    return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Dequantize via ops.aqlm_dequant and run F.linear on the raw weights."""
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+    # `scales` is not applied in this variant.
+    return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
+    """Print dequantize_weight and ops.aqlm_dequant outputs side by side."""
+    n = parts.sum().item()
+
+    device = torch.device('cuda:0')
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+    # Overwrite entries 0..15 of the first `nbooks` codebooks with known values.
+    count = 0
+    for index in range(16):
+        for i in range(8):
+            for book in range(nbooks):
+                codebooks[book, index, 0, i] = count * (10**book)
+            count += 1
+
+    print("codes shape", codes.shape)
+    # Point both ends of row 0 of `codes` at those known entries.
+    for i in range(16):
+        for book in range(nbooks):
+            codes[0, i, book] = i
+            codes[0, -i, book] = i
+
+    weights = dequantize_weight(codes, codebooks, None)
+    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+    print("weights shape:", weights.shape)
+    print("weights2 shape:", weights2.shape)
+
+    print("weights are:", weights)
+    print("weights2 are:", weights2)
+
+    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+    print("last 128 weights are", weights[0, -128:])
+    print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+    """Parse the CLI, then run the dequant tester or write the timing table."""
+    parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
+
+    # Add arguments
+    parser.add_argument("--nbooks",
+                        type=int,
+                        default=1,
+                        help="Number of codebooks (default: 1)")
+    parser.add_argument("--bits",
+                        type=int,
+                        default=16,
+                        help="Number of bits per code element (default: 16)")
+    # type=bool would map every non-empty string (even "False") to True.
+    parser.add_argument(
+        "--test",
+        type=lambda s: s.strip().lower() not in ("", "0", "false", "no"),
+        default=False,
+        help="Run the decompression/dequant tester rather than benchmarking "
+        "(default: False)")
+
+    args = parser.parse_args()
+    nbooks = args.nbooks
+    bits = args.bits
+
+    if args.test:
+        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+        return
+
+    # Otherwise, benchmark.
+    methods = [
+        ops.aqlm_gemm,
+        dequant_out_scale,
+        generic_dequantize_gemm,
+        optimized_dequantize_gemm,
+        dequant_weight_scale,
+        torch_mult,
+        dequant_no_scale,
+    ]
+
+    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+    print(f"writing benchmarks to file {filename}")
+    console = sys.stdout
+    # utf-8 keeps the "µs" column headers writable on non-UTF-8 locales.
+    with open(filename, "w", encoding="utf-8") as f:
+        # run_grid() prints to sys.stdout; `finally` restores it on errors.
+        sys.stdout = f
+        try:
+            print('m | k | n | n parts', end='')
+            for method in methods:
+                print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+            print('')
+
+            # These are reasonable prefill sizes.
+            ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+                             (4096, (11008, 11008)), (11008, (4096, )))
+            # reasonable ranges for m.
+            for m in [
+                    1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96,
+                    112, 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+            ]:
+                print(f'{m}', file=console)  # progress stays on the console
+                for ksp in ksandpartions:
+                    run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+                             methods)
+        finally:
+            sys.stdout = console
+
+
+def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
+             methods):
+    """Time each method on one (m, k, parts) shape and print one table row."""
+    # I didn't see visible improvements from increasing these, but feel free :)
+    num_warmup_trials = 1
+    num_trials = 1
+
+    num_calls = 100
+
+    # warmup.
+    for method in methods:
+        for _ in range(num_warmup_trials):
+            run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+    n = parts.sum().item()
+    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+    for method in methods:
+        best_time_us = 1e20
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+
+            if kernel_dur_us < best_time_us:
+                best_time_us = kernel_dur_us
+        # Report the best trial, not whichever trial happened to run last.
+        print(f' | {best_time_us:.0f}', end='')
+
+    print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
+               nbooks: int, bits: int, method) -> float:
+    """Return the CUDA-event time in ms per call of `method`, over num_calls."""
+    n = parts.sum().item()
+
+    device = torch.device('cuda:0')
+
+    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+    # for comparison to just a pytorch mult.
+    weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+
+    if method is torch_mult:
+        for i in range(num_calls):
+            torch_mult(input, weights, scales)
+    else:
+        for i in range(num_calls):
+            method(input, codes, codebooks, scales, parts, None)
+
+    end_event.record()
+    end_event.synchronize()  # wait for the recorded work before reading time
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -0,0 +1,233 @@
+import argparse
+
+import torch
+import torch.utils.benchmark as benchmark
+from benchmark_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
+    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    MarlinWorkspace, marlin_24_quantize, marlin_quantize)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    gptq_pack, quantize_weights, sort_weights)
+
+DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+
+ACT_ORDER_OPTS = [False, True]
+K_FULL_OPTS = [False, True]
+
+
+def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
+              size_m, size_k, size_n):
+    """Benchmark one (model, quant options, MxKxN) case; append to `results`."""
+    label = "Quant Matmul"
+    sub_label = ("{}, act={} k_full={}, b={}, g={}, "
+                 "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
+                                         group_size, size_m, size_k, size_n))
+
+    print(f"Testing: {sub_label}")
+
+    a = torch.randn(size_m, size_k).to(torch.half).cuda()
+    b = torch.rand(size_k, size_n).to(torch.half).cuda()
+
+    a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
+
+    # Marlin quant
+    (
+        marlin_w_ref,
+        marlin_q_w,
+        marlin_s,
+        marlin_g_idx,
+        marlin_sort_indices,
+        marlin_rand_perm,
+    ) = marlin_quantize(b, num_bits, group_size, act_order)
+
+    # Marlin_24 quant
+    (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
+     marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
+
+    # GPTQ quant
+    (w_ref, q_w, s, g_idx,
+     rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
+    q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
+
+    # For act_order, sort the "weights" and "g_idx"
+    # so that group ids are increasing
+    repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+    if act_order:
+        (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
+
+    # Prepare
+    marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+                                       GPTQ_MARLIN_MAX_PARALLEL)
+
+    marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
+                                          GPTQ_MARLIN_24_MAX_PARALLEL)
+
+    globals = {
+        # Gen params
+        "num_bits": num_bits,
+        "group_size": group_size,
+        "size_m": size_m,
+        "size_n": size_n,
+        "size_k": size_k,
+        "a": a,
+        "a_tmp": a_tmp,
+        # Marlin params
+        "marlin_w_ref": marlin_w_ref,
+        "marlin_q_w": marlin_q_w,
+        "marlin_s": marlin_s,
+        "marlin_g_idx": marlin_g_idx,
+        "marlin_sort_indices": marlin_sort_indices,
+        "marlin_rand_perm": marlin_rand_perm,
+        "marlin_workspace": marlin_workspace,
+        "is_k_full": is_k_full,
+        # Marlin_24 params
+        "marlin_24_w_ref": marlin_24_w_ref,
+        "marlin_24_q_w_comp": marlin_24_q_w_comp,
+        "marlin_24_meta": marlin_24_meta,
+        "marlin_24_s": marlin_24_s,
+        "marlin_24_workspace": marlin_24_workspace,
+        # GPTQ params
+        "q_w_gptq": q_w_gptq,
+        "repack_sort_indices": repack_sort_indices,
+        # Kernels
+        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
+        "gptq_marlin_repack": ops.gptq_marlin_repack,
+    }
+
+    min_run_time = 1  # seconds, passed to every blocked_autorange call below
+
+    # Warmup pytorch
+    for i in range(5):
+        torch.matmul(a, marlin_w_ref)
+
+    results.append(
+        benchmark.Timer(
+            stmt="torch.matmul(a, marlin_w_ref)",
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="pytorch_gemm",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_gemm",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+    if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+            and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
+        results.append(
+            benchmark.Timer(
+                stmt=
+                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)",  # noqa: E501
+                globals=globals,
+                label=label,
+                sub_label=sub_label,
+                description="gptq_marlin_24_gemm",
+            ).blocked_autorange(min_run_time=min_run_time))
+
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_repack",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+
+def main(args):
+    """Run bench_run over every selected shape/option combo; print a table."""
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    results = []
+    for model in args.models:
+        for layer in WEIGHT_SHAPES[model]:
+            size_k = layer[0]
+            size_n = layer[1]
+
+            if len(args.limit_k) > 0 and size_k not in args.limit_k:
+                continue
+
+            if len(args.limit_n) > 0 and size_n not in args.limit_n:
+                continue
+
+            for act_order in ACT_ORDER_OPTS:
+                if len(args.limit_act_order
+                       ) > 0 and act_order not in args.limit_act_order:
+                    continue
+
+                for is_k_full in K_FULL_OPTS:
+                    if len(args.limit_k_full
+                           ) > 0 and is_k_full not in args.limit_k_full:
+                        continue
+
+                    for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+                        if len(args.limit_num_bits
+                               ) > 0 and num_bits not in args.limit_num_bits:
+                            continue
+
+                        for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+                            if len(
+                                    args.limit_group_size
+                            ) > 0 and group_size not in args.limit_group_size:
+                                continue
+
+                            # For act_order, the group_size must be less than
+                            # size_k
+                            if act_order and (group_size == size_k
+                                              or group_size == -1):
+                                continue
+
+                            for size_m in args.batch_sizes:
+                                bench_run(results, model, act_order, is_k_full,
+                                          num_bits, group_size, size_m, size_k,
+                                          size_n)
+
+    compare = benchmark.Compare(results)
+    compare.print()
+
+
+# For quick benchmarking use:
+#   python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
+# Every --limit-* list defaults to empty, which main() treats as "no filter".
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Benchmark Marlin across specified models/shapes/batches")
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=DEFAULT_MODELS,
+        choices=WEIGHT_SHAPES.keys(),
+    )
+    parser.add_argument("--batch-sizes",
+                        nargs="+",
+                        type=int,
+                        default=DEFAULT_BATCH_SIZES)
+    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
+
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -0,0 +1,322 @@
+import argparse
+import time
+from datetime import datetime
+from typing import Any, Dict, List, Tuple
+
+import ray
+import torch
+import triton
+from ray.experimental.tqdm_ray import tqdm
+from transformers import AutoConfig
+
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+
+
+def benchmark_config(
+    config: Dict[str, int],
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8: bool,
+    num_iters: int = 100,
+) -> float:
+    """Return the mean fused_moe latency in microseconds under `config`."""
+    init_dtype = torch.float16 if use_fp8 else dtype
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    w1 = torch.randn(num_experts,
+                     shard_intermediate_size,
+                     hidden_size,
+                     dtype=init_dtype)
+    w2 = torch.randn(num_experts,
+                     hidden_size,
+                     shard_intermediate_size // 2,
+                     dtype=init_dtype)
+    gating_output = torch.randn(num_iters,
+                                num_tokens,
+                                num_experts,
+                                dtype=torch.float32)
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+    if use_fp8:
+        w1_scale = torch.randn(num_experts, dtype=torch.float32)
+        w2_scale = torch.randn(num_experts, dtype=torch.float32)
+        a1_scale = torch.randn(1, dtype=torch.float32)
+        a2_scale = torch.randn(1, dtype=torch.float32)
+        w1 = w1.to(torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fn)
+
+    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
+
+    def prepare(i: int):
+        input_gating.copy_(gating_output[i])
+
+    def run():
+        fused_moe(
+            x,
+            w1,
+            w2,
+            input_gating,
+            topk,
+            renormalize=True,
+            inplace=True,
+            override_config=config,
+            use_fp8=use_fp8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+        )
+
+    # JIT compilation & warmup
+    run()
+    torch.cuda.synchronize()
+
+    # Capture 10 invocations with CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        for _ in range(10):
+            run()
+    torch.cuda.synchronize()
+
+    # Warmup
+    for _ in range(5):
+        graph.replay()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    latencies = []
+    for i in range(num_iters):
+        prepare(i)
+        torch.cuda.synchronize()
+
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    graph.reset()
+    return avg
+
+
+def get_configs_compute_bound() -> List[Dict[str, int]]:
+    """Enumerate the reduced grid of kernel configs used for tuning."""
+    # TODO(woosuk): Increase the search space and use a performance model to
+    # prune the search space.
+    return [
+        {
+            "BLOCK_SIZE_M": block_m,
+            "BLOCK_SIZE_N": block_n,
+            "BLOCK_SIZE_K": block_k,
+            "GROUP_SIZE_M": group_size,
+            "num_warps": num_warps,
+            "num_stages": num_stages,
+        }
+        for num_stages in [2, 3, 4, 5]
+        for block_m in [16, 32, 64, 128, 256]
+        for block_k in [64, 128, 256]
+        for block_n in [32, 64, 128, 256]
+        for num_warps in [4, 8]
+        for group_size in [1, 16, 32, 64]
+    ]
+
+
+@ray.remote(num_gpus=1)
+class BenchmarkWorker:
+    """Ray actor requesting one GPU that benchmarks or tunes MoE configs."""
+
+    def __init__(self, seed: int) -> None:
+        torch.set_default_device("cuda")
+        torch.cuda.manual_seed_all(seed)
+        self.seed = seed
+
+    def benchmark(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8: bool,
+    ) -> Tuple[Dict[str, int], float]:
+        """Time the config from get_moe_configs (or the default); return both."""
+        torch.cuda.manual_seed_all(self.seed)
+        dtype_str = "float8" if use_fp8 else None
+        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+        # is the intermediate size after silu_and_mul.
+        op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
+                                    dtype_str)
+        if op_config is None:
+            config = get_default_config(num_tokens, num_experts,
+                                        shard_intermediate_size, hidden_size,
+                                        topk, dtype_str)
+        else:
+            config = op_config[min(op_config.keys(),
+                                   key=lambda x: abs(x - num_tokens))]
+        kernel_time = benchmark_config(config, num_tokens, num_experts,
+                                       shard_intermediate_size, hidden_size,
+                                       topk, dtype, use_fp8)
+        return config, kernel_time
+
+    def tune(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8: bool,
+        search_space: List[Dict[str, int]],
+    ) -> Dict[str, int]:
+        """Return the fastest config in `search_space`; raise if none ran."""
+        best_config = None
+        best_time = float("inf")
+        for config in tqdm(search_space):
+            try:
+                kernel_time = benchmark_config(config, num_tokens, num_experts,
+                                               shard_intermediate_size,
+                                               hidden_size, topk, dtype,
+                                               use_fp8, num_iters=10)
+            except triton.runtime.autotuner.OutOfResources:
+                # Some configurations may be invalid and fail to compile.
+                continue
+
+            if kernel_time < best_time:
+                best_time = kernel_time
+                best_config = config
+        if best_config is None:
+            raise RuntimeError(
+                f"No valid kernel config found for batch_size={num_tokens}")
+        now = datetime.now()
+        print(f"[{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        return best_config
+
+
+def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+    """Return a new dict holding the six tuning keys in a fixed order."""
+    ordered_keys = (
+        "BLOCK_SIZE_M", "BLOCK_SIZE_N", "BLOCK_SIZE_K",
+        "GROUP_SIZE_M", "num_warps", "num_stages",
+    )
+    # Any other key in `config` is dropped; a missing key raises KeyError,
+    # exactly as direct indexing would.
+    return {name: config[name] for name in ordered_keys}
+
+
+def save_configs(
+    configs: Dict[int, Dict[str, int]],
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8: bool,
+) -> None:
+    """Write `configs` as indented JSON to the get_config_file_name() path."""
+    import json  # not imported at file level; don't rely on the star import
+    dtype_str = "float8" if use_fp8 else None
+    # NOTE(woosuk): the file name uses w2.shape[2] (size after silu_and_mul).
+    filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
+                                    dtype_str)
+    print(f"Writing best config to {filename}...")
+    with open(filename, "w") as f:
+        f.write(json.dumps(configs, indent=4) + "\n")
+
+
+def main(args: argparse.Namespace):
+    """Benchmark, or with --tune search, fused-MoE kernel configs for a model."""
+    print(args)
+    config = AutoConfig.from_pretrained(args.model)
+    if config.architectures[0] == "DbrxForCausalLM":
+        E = config.ffn_config.moe_num_experts
+        topk = config.ffn_config.moe_top_k
+        intermediate_size = config.ffn_config.ffn_hidden_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    else:
+        # Default: Mixtral.
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+
+    hidden_size = config.hidden_size
+    dtype = config.torch_dtype
+    use_fp8 = args.dtype == "fp8"
+
+    if args.batch_size is None:
+        batch_sizes = [
+            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+            2048, 3072, 4096
+        ]
+    else:
+        batch_sizes = [args.batch_size]
+    # One BenchmarkWorker per GPU that ray reports as available.
+    ray.init()
+    num_gpus = int(ray.available_resources()["GPU"])
+    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
+
+    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
+        outputs = []
+        worker_idx = 0
+        for input_args in inputs:
+            worker = workers[worker_idx]
+            worker_method = getattr(worker, method)
+            output = worker_method.remote(*input_args)
+            outputs.append(output)
+            worker_idx = (worker_idx + 1) % num_gpus
+        return ray.get(outputs)
+
+    if args.tune:
+        search_space = get_configs_compute_bound()
+        print(f"Start tuning over {len(search_space)} configurations...")
+
+        start = time.time()
+        configs = _distribute(
+            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
+                      topk, dtype, use_fp8, search_space)
+                     for batch_size in batch_sizes])
+        best_configs = {
+            M: sort_config(config)
+            for M, config in zip(batch_sizes, configs)
+        }
+        save_configs(best_configs, E, shard_intermediate_size, hidden_size,
+                     topk, dtype, use_fp8)
+        end = time.time()
+        print(f"Tuning took {end - start:.2f} seconds")
+    else:
+        outputs = _distribute("benchmark",
+                              [(batch_size, E, shard_intermediate_size,
+                                hidden_size, topk, dtype, use_fp8)
+                               for batch_size in batch_sizes])
+
+        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
+            print(f"Batch size: {batch_size}, config: {config}")
+            print(f"Kernel time: {kernel_time:.2f} us")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model",
+                        type=str,
+                        default="mistralai/Mixtral-8x7B-Instruct-v0.1")
+    parser.add_argument("--tp-size", "-tp", type=int, default=2)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["auto", "fp8"],
+                        default="auto")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument("--tune", action="store_true")
+    args = parser.parse_args()
+    # Without --batch-size, main() sweeps its built-in list of batch sizes.
+    main(args)
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -0,0 +1,209 @@
+import argparse
+import random
+import time
+from typing import Optional
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
+
+# Number of KV-cache blocks created for the benchmark; block-table entries
+# in main() are drawn uniformly from [0, NUM_BLOCKS - 1].
+NUM_BLOCKS = 1024
+# Divisor used in main() to size the v2 kernel's scratch buffers:
+# num_partitions = ceil(max_seq_len / PARTITION_SIZE).
+PARTITION_SIZE = 512
+
+
+@torch.inference_mode()
+def main(
+    version: str,
+    num_seqs: int,
+    seq_len: int,
+    num_query_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    do_profile: bool,
+    device: str = "cuda",
+    kv_cache_dtype: Optional[str] = None,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    scale = float(1.0 / (head_size**0.5))
+    query = torch.empty(num_seqs,
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype,
+                        device=device)
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads,
+                                   dtype=torch.float,
+                                   device=device)
+
+    seq_lens = [seq_len for _ in range(num_seqs)]
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1)
+            for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables.append(block_table)
+    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
+
+    # Create the KV cache.
+    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
+                                                            block_size,
+                                                            1,
+                                                            num_kv_heads,
+                                                            head_size,
+                                                            kv_cache_dtype,
+                                                            dtype,
+                                                            device=device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Prepare for the paged attention kernel.
+    output = torch.empty_like(query)
+    if version == "v2":
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
+        tmp_output = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        # Using default kv_scale
+        kv_scale = 1.0
+
+        for _ in range(num_iters):
+            if version == "v1":
+                ops.paged_attention_v1(
+                    output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    kv_scale,
+                )
+            elif version == "v2":
+                ops.paged_attention_v2(
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    kv_scale,
+                )
+            else:
+                raise ValueError(f"Invalid version: {version}")
+        torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        return (end_time - start_time) / num_iters
+
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=3, profile=False)
+
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=100, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Benchmark the paged attention kernel.")
+    parser.add_argument("--version",
+                        type=str,
+                        choices=["v1", "v2"],
+                        default="v2")
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--seq_len", type=int, default=4096)
+    parser.add_argument("--num-query-heads", type=int, default=64)
+    parser.add_argument("--num-kv-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        default=128)
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--use-alibi", action="store_true")
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["half", "bfloat16", "float"],
+                        default="half")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
+        default="auto",
+        help="Data type for kv cache storage. If 'auto', will use model "
+        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+        "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
+    args = parser.parse_args()
+    print(args)
+
+    if args.num_query_heads % args.num_kv_heads != 0:
+        raise ValueError("num_query_heads must be divisible by num_kv_heads")
+    main(
+        version=args.version,
+        num_seqs=args.batch_size,
+        seq_len=args.seq_len,
+        num_query_heads=args.num_query_heads,
+        num_kv_heads=args.num_kv_heads,
+        head_size=args.head_size,
+        block_size=args.block_size,
+        use_alibi=args.use_alibi,
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+        kv_cache_dtype=args.kv_cache_dtype,
+    )
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -0,0 +1,121 @@
+import argparse
+from itertools import accumulate
+from typing import Optional
+
+import nvtx
+import torch
+
+from vllm.model_executor.layers.rotary_embedding import get_rope
+
+
+def benchmark_rope_kernels_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    # silulating serving 4 LoRAs
+    scaling_factors = [1, 2, 4, 8]
+    # batched RoPE can take multiple scaling factors
+    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
+                            is_neox_style, {
+                                "type": "linear",
+                                "factor": tuple(scaling_factors)
+                            })
+    # non-batched RoPE takes only one scaling factor, we create multiple
+    # instances to simulate the same behavior
+    non_batched_ropes = []
+    for scaling_factor in scaling_factors:
+        non_batched_ropes.append(
+            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
+                     {
+                         "type": "linear",
+                         "factor": (scaling_factor, )
+                     }))
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    # create query offsets for batched RoPE, we concat multiple kv cache
+    # together and each query needs to find the right kv cache of its type
+    offset_map = torch.tensor(
+        list(
+            accumulate([0] + [
+                max_position * scaling_factor * 2
+                for scaling_factor in scaling_factors[:-1]
+            ])))
+    query_types = torch.randint(0,
+                                len(scaling_factors), (batch_size, seq_len),
+                                device=device)
+    # map query types to offsets
+    query_offsets = offset_map[query_types]
+    # the kernel takes flattened offsets
+    flatten_offsets = query_offsets.flatten()
+
+    # batched queries of the same type together for non-batched RoPE
+    queries = [query[query_types == i] for i in range(len(scaling_factors))]
+    keys = [key[query_types == i] for i in range(len(scaling_factors))]
+    packed_qkr = zip(queries, keys, non_batched_ropes)
+    # synchronize before start timing
+    torch.cuda.synchronize()
+    with nvtx.annotate("non-batched", color="yellow"):
+        for q, k, r in packed_qkr:
+            r.forward(positions, q, k)
+    torch.cuda.synchronize()
+    with nvtx.annotate("batched", color="green"):
+        batched_rope.forward(positions, query, key, flatten_offsets)
+    torch.cuda.synchronize()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Benchmark the rotary embedding kernels.")
+    parser.add_argument("--is-neox-style", type=bool, default=True)
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--num-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        default=128)
+    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["bfloat16", "float"],
+                        default="float")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--device",
+                        type=str,
+                        choices=["cuda:0", "cuda:1"],
+                        default="cuda:0")
+    args = parser.parse_args()
+    print(args)
+
+    benchmark_rope_kernels_multi_lora(
+        is_neox_style=args.is_neox_style,
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        num_heads=args.num_heads,
+        head_size=args.head_size,
+        rotary_dim=args.rotary_dim,
+        dtype=getattr(torch, args.dtype),
+        seed=args.seed,
+        device=args.device,
+    )
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -0,0 +1,75 @@
+# Table of benchmark shapes. Each key is either "ideal" or a label of the
+# form "<model id>/TP<n>"; each value is a list of two-element [int, int]
+# entries.
+# NOTE(review): presumably each pair is a weight-matrix shape for one of the
+# model's linear layers at tensor-parallel degree n -- confirm the axis order
+# against the benchmark script that imports this table.
+WEIGHT_SHAPES = {
+    "ideal": [[4 * 256 * 32, 256 * 32]],
+    "mistralai/Mistral-7B-v0.1/TP1": [
+        [4096, 6144],
+        [4096, 4096],
+        [4096, 28672],
+        [14336, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP2": [
+        [4096, 3072],
+        [2048, 4096],
+        [4096, 14336],
+        [7168, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP4": [
+        [4096, 1536],
+        [1024, 4096],
+        [4096, 7168],
+        [3584, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP1": [
+        [4096, 12288],
+        [4096, 4096],
+        [4096, 22016],
+        [11008, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP2": [
+        [4096, 6144],
+        [2048, 4096],
+        [4096, 11008],
+        [5504, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP4": [
+        [4096, 3072],
+        [1024, 4096],
+        [4096, 5504],
+        [2752, 4096],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP1": [
+        [5120, 15360],
+        [5120, 5120],
+        [5120, 27648],
+        [13824, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP2": [
+        [5120, 7680],
+        [2560, 5120],
+        [5120, 13824],
+        [6912, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP4": [
+        [5120, 3840],
+        [1280, 5120],
+        [5120, 6912],
+        [3456, 5120],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP1": [
+        [8192, 10240],
+        [8192, 8192],
+        [8192, 57344],
+        [28672, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP2": [
+        [8192, 5120],
+        [4096, 8192],
+        [8192, 28672],
+        [14336, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP4": [
+        [8192, 2560],
+        [2048, 8192],
+        [8192, 14336],
+        [7168, 8192],
+    ],
+}
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+PORT=8000
+MODEL=$1
+TOKENS=$2
+
+docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
+           -v $PWD/data:/data \
+           ghcr.io/huggingface/text-generation-inference:1.4.0 \
+           --model-id $MODEL \
+           --sharded false  \
+           --max-input-length 1024 \
+           --max-total-tokens 2048 \
+           --max-best-of 5 \
+           --max-concurrent-requests 5000 \
+           --max-batch-total-tokens $TOKENS
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -0,0 +1,63 @@
+import argparse
+import cProfile
+import pstats
+
+from vllm import LLM, SamplingParams
+
+# A very long prompt, total number of tokens is about 15k.
+LONG_PROMPT = ["You are an expert in large language models, aren't you?"
+               ] * 1000
+LONG_PROMPT = ' '.join(LONG_PROMPT)
+
+
+def main(args):
+    llm = LLM(
+        model=args.model,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        tensor_parallel_size=args.tensor_parallel_size,
+        use_v2_block_manager=args.use_v2_block_manager,
+    )
+
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+    profiler = cProfile.Profile()
+
+    print("------warm up------")
+    for i in range(3):
+        output = llm.generate(LONG_PROMPT, sampling_params)
+        print(output[0].outputs[0].text)
+
+    print("------start generating------")
+    for i in range(3):
+        profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
+                        globals(), locals())
+
+    # analyze the runtime of hashing function
+    stats = pstats.Stats(profiler)
+    stats.sort_stats('cumulative')
+    total_time = 0
+    total_calls = 0
+    for func in stats.stats:
+        if 'hash_of_block' in func[2]:
+            total_time = stats.stats[func][3]
+            total_calls = stats.stats[func][0]
+    percentage = (total_time / stats.total_tt) * 100
+    print(f"Hashing took {total_time:.2f} seconds,"
+          f"{percentage:.2f}% of the total runtime.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Benchmark the performance of hashing function in'
+        'automatic prefix caching.')
+    parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceMangerV2')
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/sonnet.txt
+++ b/benchmarks/sonnet.txt
@@ -0,0 +1,518 @@
+FROM fairest creatures we desire increase,
+That thereby beauty's rose might never die,
+But as the riper should by time decease,
+His tender heir might bear his memory:
+But thou, contracted to thine own bright eyes,
+Feed'st thy light'st flame with self-substantial fuel,
+Making a famine where abundance lies,
+Thyself thy foe, to thy sweet self too cruel.
+Thou that art now the world's fresh ornament
+And only herald to the gaudy spring,
+Within thine own bud buriest thy content
+And, tender churl, makest waste in niggarding.
+Pity the world, or else this glutton be,
+To eat the world's due, by the grave and thee.
+When forty winters shall beseige thy brow,
+And dig deep trenches in thy beauty's field,
+Thy youth's proud livery, so gazed on now,
+Will be a tatter'd weed, of small worth held:
+Then being ask'd where all thy beauty lies,
+Where all the treasure of thy lusty days,
+To say, within thine own deep-sunken eyes,
+Were an all-eating shame and thriftless praise.
+How much more praise deserved thy beauty's use,
+If thou couldst answer 'This fair child of mine
+Shall sum my count and make my old excuse,'
+Proving his beauty by succession thine!
+This were to be new made when thou art old,
+And see thy blood warm when thou feel'st it cold.
+Look in thy glass, and tell the face thou viewest
+Now is the time that face should form another;
+Whose fresh repair if now thou not renewest,
+Thou dost beguile the world, unbless some mother.
+For where is she so fair whose unear'd womb
+Disdains the tillage of thy husbandry?
+Or who is he so fond will be the tomb
+Of his self-love, to stop posterity?
+Thou art thy mother's glass, and she in thee
+Calls back the lovely April of her prime:
+So thou through windows of thine age shall see
+Despite of wrinkles this thy golden time.
+But if thou live, remember'd not to be,
+Die single, and thine image dies with thee.
+Unthrifty loveliness, why dost thou spend
+Upon thyself thy beauty's legacy?
+Nature's bequest gives nothing but doth lend,
+And being frank she lends to those are free.
+Then, beauteous niggard, why dost thou abuse
+The bounteous largess given thee to give?
+Profitless usurer, why dost thou use
+So great a sum of sums, yet canst not live?
+For having traffic with thyself alone,
+Thou of thyself thy sweet self dost deceive.
+Then how, when nature calls thee to be gone,
+What acceptable audit canst thou leave?
+Thy unused beauty must be tomb'd with thee,
+Which, used, lives th' executor to be.
+Those hours, that with gentle work did frame
+The lovely gaze where every eye doth dwell,
+Will play the tyrants to the very same
+And that unfair which fairly doth excel:
+For never-resting time leads summer on
+To hideous winter and confounds him there;
+Sap cheque'd with frost and lusty leaves quite gone,
+Beauty o'ersnow'd and bareness every where:
+Then, were not summer's distillation left,
+A liquid prisoner pent in walls of glass,
+Beauty's effect with beauty were bereft,
+Nor it nor no remembrance what it was:
+But flowers distill'd though they with winter meet,
+Leese but their show; their substance still lives sweet.
+Then let not winter's ragged hand deface
+In thee thy summer, ere thou be distill'd:
+Make sweet some vial; treasure thou some place
+With beauty's treasure, ere it be self-kill'd.
+That use is not forbidden usury,
+Which happies those that pay the willing loan;
+That's for thyself to breed another thee,
+Or ten times happier, be it ten for one;
+Ten times thyself were happier than thou art,
+If ten of thine ten times refigured thee:
+Then what could death do, if thou shouldst depart,
+Leaving thee living in posterity?
+Be not self-will'd, for thou art much too fair
+To be death's conquest and make worms thine heir.
+Lo! in the orient when the gracious light
+Lifts up his burning head, each under eye
+Doth homage to his new-appearing sight,
+Serving with looks his sacred majesty;
+And having climb'd the steep-up heavenly hill,
+Resembling strong youth in his middle age,
+yet mortal looks adore his beauty still,
+Attending on his golden pilgrimage;
+But when from highmost pitch, with weary car,
+Like feeble age, he reeleth from the day,
+The eyes, 'fore duteous, now converted are
+From his low tract and look another way:
+So thou, thyself out-going in thy noon,
+Unlook'd on diest, unless thou get a son.
+Music to hear, why hear'st thou music sadly?
+Sweets with sweets war not, joy delights in joy.
+Why lovest thou that which thou receivest not gladly,
+Or else receivest with pleasure thine annoy?
+If the true concord of well-tuned sounds,
+By unions married, do offend thine ear,
+They do but sweetly chide thee, who confounds
+In singleness the parts that thou shouldst bear.
+Mark how one string, sweet husband to another,
+Strikes each in each by mutual ordering,
+Resembling sire and child and happy mother
+Who all in one, one pleasing note do sing:
+Whose speechless song, being many, seeming one,
+Sings this to thee: 'thou single wilt prove none.'
+Is it for fear to wet a widow's eye
+That thou consumest thyself in single life?
+Ah! if thou issueless shalt hap to die.
+The world will wail thee, like a makeless wife;
+The world will be thy widow and still weep
+That thou no form of thee hast left behind,
+When every private widow well may keep
+By children's eyes her husband's shape in mind.
+Look, what an unthrift in the world doth spend
+Shifts but his place, for still the world enjoys it;
+But beauty's waste hath in the world an end,
+And kept unused, the user so destroys it.
+No love toward others in that bosom sits
+That on himself such murderous shame commits.
+For shame! deny that thou bear'st love to any,
+Who for thyself art so unprovident.
+Grant, if thou wilt, thou art beloved of many,
+But that thou none lovest is most evident;
+For thou art so possess'd with murderous hate
+That 'gainst thyself thou stick'st not to conspire.
+Seeking that beauteous roof to ruinate
+Which to repair should be thy chief desire.
+O, change thy thought, that I may change my mind!
+Shall hate be fairer lodged than gentle love?
+Be, as thy presence is, gracious and kind,
+Or to thyself at least kind-hearted prove:
+Make thee another self, for love of me,
+That beauty still may live in thine or thee.
+As fast as thou shalt wane, so fast thou growest
+In one of thine, from that which thou departest;
+And that fresh blood which youngly thou bestowest
+Thou mayst call thine when thou from youth convertest.
+Herein lives wisdom, beauty and increase:
+Without this, folly, age and cold decay:
+If all were minded so, the times should cease
+And threescore year would make the world away.
+Let those whom Nature hath not made for store,
+Harsh featureless and rude, barrenly perish:
+Look, whom she best endow'd she gave the more;
+Which bounteous gift thou shouldst in bounty cherish:
+She carved thee for her seal, and meant thereby
+Thou shouldst print more, not let that copy die.
+When I do count the clock that tells the time,
+And see the brave day sunk in hideous night;
+When I behold the violet past prime,
+And sable curls all silver'd o'er with white;
+When lofty trees I see barren of leaves
+Which erst from heat did canopy the herd,
+And summer's green all girded up in sheaves
+Borne on the bier with white and bristly beard,
+Then of thy beauty do I question make,
+That thou among the wastes of time must go,
+Since sweets and beauties do themselves forsake
+And die as fast as they see others grow;
+And nothing 'gainst Time's scythe can make defence
+Save breed, to brave him when he takes thee hence.
+O, that you were yourself! but, love, you are
+No longer yours than you yourself here live:
+Against this coming end you should prepare,
+And your sweet semblance to some other give.
+So should that beauty which you hold in lease
+Find no determination: then you were
+Yourself again after yourself's decease,
+When your sweet issue your sweet form should bear.
+Who lets so fair a house fall to decay,
+Which husbandry in honour might uphold
+Against the stormy gusts of winter's day
+And barren rage of death's eternal cold?
+O, none but unthrifts! Dear my love, you know
+You had a father: let your son say so.
+Not from the stars do I my judgment pluck;
+And yet methinks I have astronomy,
+But not to tell of good or evil luck,
+Of plagues, of dearths, or seasons' quality;
+Nor can I fortune to brief minutes tell,
+Pointing to each his thunder, rain and wind,
+Or say with princes if it shall go well,
+By oft predict that I in heaven find:
+But from thine eyes my knowledge I derive,
+And, constant stars, in them I read such art
+As truth and beauty shall together thrive,
+If from thyself to store thou wouldst convert;
+Or else of thee this I prognosticate:
+Thy end is truth's and beauty's doom and date.
+When I consider every thing that grows
+Holds in perfection but a little moment,
+That this huge stage presenteth nought but shows
+Whereon the stars in secret influence comment;
+When I perceive that men as plants increase,
+Cheered and cheque'd even by the self-same sky,
+Vaunt in their youthful sap, at height decrease,
+And wear their brave state out of memory;
+Then the conceit of this inconstant stay
+Sets you most rich in youth before my sight,
+Where wasteful Time debateth with Decay,
+To change your day of youth to sullied night;
+And all in war with Time for love of you,
+As he takes from you, I engraft you new.
+But wherefore do not you a mightier way
+Make war upon this bloody tyrant, Time?
+And fortify yourself in your decay
+With means more blessed than my barren rhyme?
+Now stand you on the top of happy hours,
+And many maiden gardens yet unset
+With virtuous wish would bear your living flowers,
+Much liker than your painted counterfeit:
+So should the lines of life that life repair,
+Which this, Time's pencil, or my pupil pen,
+Neither in inward worth nor outward fair,
+Can make you live yourself in eyes of men.
+To give away yourself keeps yourself still,
+And you must live, drawn by your own sweet skill.
+Who will believe my verse in time to come,
+If it were fill'd with your most high deserts?
+Though yet, heaven knows, it is but as a tomb
+Which hides your life and shows not half your parts.
+If I could write the beauty of your eyes
+And in fresh numbers number all your graces,
+The age to come would say 'This poet lies:
+Such heavenly touches ne'er touch'd earthly faces.'
+So should my papers yellow'd with their age
+Be scorn'd like old men of less truth than tongue,
+And your true rights be term'd a poet's rage
+And stretched metre of an antique song:
+But were some child of yours alive that time,
+You should live twice; in it and in my rhyme.
+Shall I compare thee to a summer's day?
+Thou art more lovely and more temperate:
+Rough winds do shake the darling buds of May,
+And summer's lease hath all too short a date:
+Sometime too hot the eye of heaven shines,
+And often is his gold complexion dimm'd;
+And every fair from fair sometime declines,
+By chance or nature's changing course untrimm'd;
+But thy eternal summer shall not fade
+Nor lose possession of that fair thou owest;
+Nor shall Death brag thou wander'st in his shade,
+When in eternal lines to time thou growest:
+So long as men can breathe or eyes can see,
+So long lives this and this gives life to thee.
+Devouring Time, blunt thou the lion's paws,
+And make the earth devour her own sweet brood;
+Pluck the keen teeth from the fierce tiger's jaws,
+And burn the long-lived phoenix in her blood;
+Make glad and sorry seasons as thou fleets,
+And do whate'er thou wilt, swift-footed Time,
+To the wide world and all her fading sweets;
+But I forbid thee one most heinous crime:
+O, carve not with thy hours my love's fair brow,
+Nor draw no lines there with thine antique pen;
+Him in thy course untainted do allow
+For beauty's pattern to succeeding men.
+Yet, do thy worst, old Time: despite thy wrong,
+My love shall in my verse ever live young.
+A woman's face with Nature's own hand painted
+Hast thou, the master-mistress of my passion;
+A woman's gentle heart, but not acquainted
+With shifting change, as is false women's fashion;
+An eye more bright than theirs, less false in rolling,
+Gilding the object whereupon it gazeth;
+A man in hue, all 'hues' in his controlling,
+Much steals men's eyes and women's souls amazeth.
+And for a woman wert thou first created;
+Till Nature, as she wrought thee, fell a-doting,
+And by addition me of thee defeated,
+By adding one thing to my purpose nothing.
+But since she prick'd thee out for women's pleasure,
+Mine be thy love and thy love's use their treasure.
+So is it not with me as with that Muse
+Stirr'd by a painted beauty to his verse,
+Who heaven itself for ornament doth use
+And every fair with his fair doth rehearse
+Making a couplement of proud compare,
+With sun and moon, with earth and sea's rich gems,
+With April's first-born flowers, and all things rare
+That heaven's air in this huge rondure hems.
+O' let me, true in love, but truly write,
+And then believe me, my love is as fair
+As any mother's child, though not so bright
+As those gold candles fix'd in heaven's air:
+Let them say more than like of hearsay well;
+I will not praise that purpose not to sell.
+My glass shall not persuade me I am old,
+So long as youth and thou are of one date;
+But when in thee time's furrows I behold,
+Then look I death my days should expiate.
+For all that beauty that doth cover thee
+Is but the seemly raiment of my heart,
+Which in thy breast doth live, as thine in me:
+How can I then be elder than thou art?
+O, therefore, love, be of thyself so wary
+As I, not for myself, but for thee will;
+Bearing thy heart, which I will keep so chary
+As tender nurse her babe from faring ill.
+Presume not on thy heart when mine is slain;
+Thou gavest me thine, not to give back again.
+As an unperfect actor on the stage
+Who with his fear is put besides his part,
+Or some fierce thing replete with too much rage,
+Whose strength's abundance weakens his own heart.
+So I, for fear of trust, forget to say
+The perfect ceremony of love's rite,
+And in mine own love's strength seem to decay,
+O'ercharged with burden of mine own love's might.
+O, let my books be then the eloquence
+And dumb presagers of my speaking breast,
+Who plead for love and look for recompense
+More than that tongue that more hath more express'd.
+O, learn to read what silent love hath writ:
+To hear with eyes belongs to love's fine wit.
+Mine eye hath play'd the painter and hath stell'd
+Thy beauty's form in table of my heart;
+My body is the frame wherein 'tis held,
+And perspective it is the painter's art.
+For through the painter must you see his skill,
+To find where your true image pictured lies;
+Which in my bosom's shop is hanging still,
+That hath his windows glazed with thine eyes.
+Now see what good turns eyes for eyes have done:
+Mine eyes have drawn thy shape, and thine for me
+Are windows to my breast, where-through the sun
+Delights to peep, to gaze therein on thee;
+Yet eyes this cunning want to grace their art;
+They draw but what they see, know not the heart.
+Let those who are in favour with their stars
+Of public honour and proud titles boast,
+Whilst I, whom fortune of such triumph bars,
+Unlook'd for joy in that I honour most.
+Great princes' favourites their fair leaves spread
+But as the marigold at the sun's eye,
+And in themselves their pride lies buried,
+For at a frown they in their glory die.
+The painful warrior famoused for fight,
+After a thousand victories once foil'd,
+Is from the book of honour razed quite,
+And all the rest forgot for which he toil'd:
+Then happy I, that love and am beloved
+Where I may not remove nor be removed.
+Lord of my love, to whom in vassalage
+Thy merit hath my duty strongly knit,
+To thee I send this written embassage,
+To witness duty, not to show my wit:
+Duty so great, which wit so poor as mine
+May make seem bare, in wanting words to show it,
+But that I hope some good conceit of thine
+In thy soul's thought, all naked, will bestow it;
+Till whatsoever star that guides my moving
+Points on me graciously with fair aspect
+And puts apparel on my tatter'd loving,
+To show me worthy of thy sweet respect:
+Then may I dare to boast how I do love thee;
+Till then not show my head where thou mayst prove me.
+Weary with toil, I haste me to my bed,
+The dear repose for limbs with travel tired;
+But then begins a journey in my head,
+To work my mind, when body's work's expired:
+For then my thoughts, from far where I abide,
+Intend a zealous pilgrimage to thee,
+And keep my drooping eyelids open wide,
+Looking on darkness which the blind do see
+Save that my soul's imaginary sight
+Presents thy shadow to my sightless view,
+Which, like a jewel hung in ghastly night,
+Makes black night beauteous and her old face new.
+Lo! thus, by day my limbs, by night my mind,
+For thee and for myself no quiet find.
+How can I then return in happy plight,
+That am debarr'd the benefit of rest?
+When day's oppression is not eased by night,
+But day by night, and night by day, oppress'd?
+And each, though enemies to either's reign,
+Do in consent shake hands to torture me;
+The one by toil, the other to complain
+How far I toil, still farther off from thee.
+I tell the day, to please them thou art bright
+And dost him grace when clouds do blot the heaven:
+So flatter I the swart-complexion'd night,
+When sparkling stars twire not thou gild'st the even.
+But day doth daily draw my sorrows longer
+And night doth nightly make grief's strength seem stronger.
+When, in disgrace with fortune and men's eyes,
+I all alone beweep my outcast state
+And trouble deaf heaven with my bootless cries
+And look upon myself and curse my fate,
+Wishing me like to one more rich in hope,
+Featured like him, like him with friends possess'd,
+Desiring this man's art and that man's scope,
+With what I most enjoy contented least;
+Yet in these thoughts myself almost despising,
+Haply I think on thee, and then my state,
+Like to the lark at break of day arising
+From sullen earth, sings hymns at heaven's gate;
+For thy sweet love remember'd such wealth brings
+That then I scorn to change my state with kings.
+When to the sessions of sweet silent thought
+I summon up remembrance of things past,
+I sigh the lack of many a thing I sought,
+And with old woes new wail my dear time's waste:
+Then can I drown an eye, unused to flow,
+For precious friends hid in death's dateless night,
+And weep afresh love's long since cancell'd woe,
+And moan the expense of many a vanish'd sight:
+Then can I grieve at grievances foregone,
+And heavily from woe to woe tell o'er
+The sad account of fore-bemoaned moan,
+Which I new pay as if not paid before.
+But if the while I think on thee, dear friend,
+All losses are restored and sorrows end.
+Thy bosom is endeared with all hearts,
+Which I by lacking have supposed dead,
+And there reigns love and all love's loving parts,
+And all those friends which I thought buried.
+How many a holy and obsequious tear
+Hath dear religious love stol'n from mine eye
+As interest of the dead, which now appear
+But things removed that hidden in thee lie!
+Thou art the grave where buried love doth live,
+Hung with the trophies of my lovers gone,
+Who all their parts of me to thee did give;
+That due of many now is thine alone:
+Their images I loved I view in thee,
+And thou, all they, hast all the all of me.
+If thou survive my well-contented day,
+When that churl Death my bones with dust shall cover,
+And shalt by fortune once more re-survey
+These poor rude lines of thy deceased lover,
+Compare them with the bettering of the time,
+And though they be outstripp'd by every pen,
+Reserve them for my love, not for their rhyme,
+Exceeded by the height of happier men.
+O, then vouchsafe me but this loving thought:
+'Had my friend's Muse grown with this growing age,
+A dearer birth than this his love had brought,
+To march in ranks of better equipage:
+But since he died and poets better prove,
+Theirs for their style I'll read, his for his love.'
+Full many a glorious morning have I seen
+Flatter the mountain-tops with sovereign eye,
+Kissing with golden face the meadows green,
+Gilding pale streams with heavenly alchemy;
+Anon permit the basest clouds to ride
+With ugly rack on his celestial face,
+And from the forlorn world his visage hide,
+Stealing unseen to west with this disgrace:
+Even so my sun one early morn did shine
+With all triumphant splendor on my brow;
+But out, alack! he was but one hour mine;
+The region cloud hath mask'd him from me now.
+Yet him for this my love no whit disdaineth;
+Suns of the world may stain when heaven's sun staineth.
+Why didst thou promise such a beauteous day,
+And make me travel forth without my cloak,
+To let base clouds o'ertake me in my way,
+Hiding thy bravery in their rotten smoke?
+'Tis not enough that through the cloud thou break,
+To dry the rain on my storm-beaten face,
+For no man well of such a salve can speak
+That heals the wound and cures not the disgrace:
+Nor can thy shame give physic to my grief;
+Though thou repent, yet I have still the loss:
+The offender's sorrow lends but weak relief
+To him that bears the strong offence's cross.
+Ah! but those tears are pearl which thy love sheds,
+And they are rich and ransom all ill deeds.
+No more be grieved at that which thou hast done:
+Roses have thorns, and silver fountains mud;
+Clouds and eclipses stain both moon and sun,
+And loathsome canker lives in sweetest bud.
+All men make faults, and even I in this,
+Authorizing thy trespass with compare,
+Myself corrupting, salving thy amiss,
+Excusing thy sins more than thy sins are;
+For to thy sensual fault I bring in sense--
+Thy adverse party is thy advocate--
+And 'gainst myself a lawful plea commence:
+Such civil war is in my love and hate
+That I an accessary needs must be
+To that sweet thief which sourly robs from me.
+Let me confess that we two must be twain,
+Although our undivided loves are one:
+So shall those blots that do with me remain
+Without thy help by me be borne alone.
+In our two loves there is but one respect,
+Though in our lives a separable spite,
+Which though it alter not love's sole effect,
+Yet doth it steal sweet hours from love's delight.
+I may not evermore acknowledge thee,
+Lest my bewailed guilt should do thee shame,
+Nor thou with public kindness honour me,
+Unless thou take that honour from thy name:
+But do not so; I love thee in such sort
+As, thou being mine, mine is thy good report.
+As a decrepit father takes delight
+To see his active child do deeds of youth,
+So I, made lame by fortune's dearest spite,
+Take all my comfort of thy worth and truth.
+For whether beauty, birth, or wealth, or wit,
+Or any of these all, or all, or more,
+Entitled in thy parts do crowned sit,
+I make my love engrafted to this store:
+So then I am not lame, poor, nor despised,
+Whilst that this shadow doth such substance give
+That I in thy abundance am sufficed
+And by a part of all thy glory live.
+Look, what is best, that best I wish in thee:
+This wish I have; then ten times happy me!
--- a/cacheflow/http_frontend/fastapi_frontend.py
+++ b/cacheflow/http_frontend/fastapi_frontend.py
@@ -1,179 +0,0 @@
-import argparse
-import asyncio
-import time
-from typing import List, Dict
-import json
-
-import ray
-from transformers import AutoTokenizer
-from fastapi import FastAPI, Request
-from fastapi.responses import StreamingResponse
-import uvicorn
-
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import Sequence, SequenceGroup
-from cacheflow.master.server import (Server, add_server_arguments,
-                                     initialize_ray_cluster)
-from cacheflow.worker.controller import DeviceID
-from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
-
-TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds
-app = FastAPI()
-
-
-class FastAPIFrontend:
-    def __init__(
-        self,
-        model: str,
-        model_path: str,
-        pipeline_parallel_size: int,
-        tensor_parallel_size: int,
-        block_size: int,
-        dtype: str,
-        seed: int,
-        swap_space: int,
-        max_num_batched_tokens: int,
-        num_nodes: int,
-        num_devices_per_node: int,
-        distributed_init_method: str,
-        all_stage_devices: List[List[DeviceID]],
-    ):
-        self.block_size = block_size
-
-        self.tokenizer = AutoTokenizer.from_pretrained(model)
-        self.seq_group_counter = Counter()
-        self.seq_counter = Counter()
-        remote_server_class = ray.remote(num_cpus=0)(Server)
-        self.server = remote_server_class.remote(
-            model=model,
-            model_path=model_path,
-            use_dummy_weights=False,
-            pipeline_parallel_size=pipeline_parallel_size,
-            tensor_parallel_size=tensor_parallel_size,
-            block_size=block_size,
-            dtype=dtype,
-            seed=seed,
-            swap_space=swap_space,
-            max_num_batched_tokens=max_num_batched_tokens,
-            num_nodes=num_nodes,
-            num_devices_per_node=num_devices_per_node,
-            distributed_init_method=distributed_init_method,
-            all_stage_devices=all_stage_devices,
-            gpu_memory=get_gpu_memory(),
-            cpu_memory=get_cpu_memory(),
-        )
-
-        self.running_seq_groups: Dict[int, SequenceGroup] = {}
-        self.sequence_group_events: Dict[int, asyncio.Event] = {}
-        self.is_server_running = False
-
-    async def server_step(self):
-        self.is_server_running = True
-        updated_seq_groups = await self.server.step.remote()
-        self.is_server_running = False
-        # Notify the waiting coroutines that there new outputs ready.
-        for seq_group in updated_seq_groups:
-            group_id = seq_group.group_id
-            self.running_seq_groups[group_id] = seq_group
-            self.sequence_group_events[group_id].set()
-
-    async def generate(self, request_dict: Dict):
-        # Preprocess the request.
-        prompt = request_dict["prompt"]
-        sampling_params = SamplingParams.from_dict(request_dict)
-        sampling_params.stop_token_ids.add(self.tokenizer.eos_token_id)
-        token_ids = self.tokenizer.encode(prompt)
-        seqs: List[Sequence] = []
-        for _ in range(sampling_params.n):
-            seq_id = next(self.seq_counter)
-            seq = Sequence(seq_id, token_ids, block_size=self.block_size)
-            seqs.append(seq)
-
-        arrival_time = time.time()
-        group_id = next(self.seq_group_counter)
-        seq_group = SequenceGroup(group_id, seqs, arrival_time)
-        # Create an event to notify us that there is new output from the
-        # cacheflow server.
-        group_event = asyncio.Event()
-        self.running_seq_groups[group_id] = seq_group
-        self.sequence_group_events[group_id] = group_event
-        # Add the request into the cacheflow server's waiting queue.
-        await self.server.add_sequence_groups.remote([(seq_group, sampling_params)])
-        # The cacheflow server does not have a background loop that keeps
-        # processing incoming requests. Therefore, we need to keep kicking
-        # the server to process the requests.
-        while True:
-            # Kick the server if the server is not running.
-            if not self.is_server_running:
-                await self.server_step()
-            # Wait for new output. The group_event will be set in server_step
-            # when there is new output available for the sequence group.
-            # Added a timeout to prevent deadlock.
-            await asyncio.wait_for(group_event.wait(), timeout=TIMEOUT_TO_PREVENT_DEADLOCK)
-            # Reset the event to wait for the next output.
-            group_event.clear()
-            # Decode and return new outputs
-            seq_group = self.running_seq_groups[group_id]
-            all_outputs = []
-            for seq in seq_group.seqs:
-                token_ids = seq.get_token_ids()
-                output = self.tokenizer.decode(token_ids, skip_special_tokens=True)
-                all_outputs.append(output)
-            ret = {
-                "text": all_outputs,
-                "error": 0,
-            }
-            yield (json.dumps(ret) + "\0").encode("utf-8")
-
-            # Once finished, release the resources of the sequence group.
-            if seq_group.is_finished():
-                del self.running_seq_groups[group_id]
-                del self.sequence_group_events[group_id]
-                # Kick the server if the server is not running. This is to
-                # prevent that there are still requests in server's waiting
-                # queue to be executed.
-                if not self.is_server_running:
-                    await self.server_step()
-                break
-
-
-@app.post("/generate")
-async def generate_stream(request: Request):
-    request_dict = await request.json()
-    return StreamingResponse(frontend.generate(request_dict))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=10002)
-    parser = add_server_arguments(parser)
-    args = parser.parse_args()
-
-    # TODO(zhuohan): Support pipeline parallelism.
-    assert args.pipeline_parallel_size == 1, (
-        'Pipeline parallelism is not supported yet.')
-
-    (num_nodes, num_devices_per_node, distributed_init_method,
-    all_stage_devices) = (
-        initialize_ray_cluster(
-            pipeline_parallel_size=args.pipeline_parallel_size,
-            tensor_parallel_size=args.tensor_parallel_size))
-
-    frontend = FastAPIFrontend(
-        model=args.model,
-        model_path=args.model_path,
-        pipeline_parallel_size=args.pipeline_parallel_size,
-        tensor_parallel_size=args.tensor_parallel_size,
-        block_size=args.block_size,
-        dtype=args.dtype,
-        seed=args.seed,
-        swap_space=args.swap_space,
-        max_num_batched_tokens=args.max_num_batched_tokens,
-        num_nodes=num_nodes,
-        num_devices_per_node=num_devices_per_node,
-        distributed_init_method=distributed_init_method,
-        all_stage_devices=all_stage_devices,
-    )
-
-    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
--- a/cacheflow/http_frontend/gradio_webserver.py
+++ b/cacheflow/http_frontend/gradio_webserver.py
@@ -1,43 +0,0 @@
-import argparse
-import json
-import time
-
-import gradio as gr
-import requests
-
-
-def http_bot(prompt):
-    headers = {"User-Agent": "Cacheflow Client"}
-    pload = {
-        "prompt": prompt,
-        "max_num_steps": 128,
-    }
-    response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
-
-    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
-        if chunk:
-            data = json.loads(chunk.decode("utf-8"))
-            output = data["text"][0]
-            yield output
-
-
-def build_demo():
-    with gr.Blocks() as demo:
-        gr.Markdown(
-            "# Cacheflow demo\n"
-        )
-        inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")# .style(container=False)
-        outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model")
-        inputbox.submit(http_bot, [inputbox], [outputbox])
-    return demo
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=10003)
-    parser.add_argument("--model-url", type=str, default="http://localhost:10002/generate")
-    args = parser.parse_args()
-
-    demo = build_demo()
-    demo.queue(concurrency_count=100).launch(server_name=args.host, server_port=args.port)
--- a/cacheflow/http_frontend/test_cli_client.py
+++ b/cacheflow/http_frontend/test_cli_client.py
@@ -1,23 +0,0 @@
-import requests
-import json
-
-def http_request():
-    prompt = "Ion Stoica is a"
-
-    headers = {"User-Agent": "Test Client"}
-    pload = {
-        "prompt": prompt,
-        "n": 4,
-        "use_beam_search": True,
-        "temperature": 0.0,
-    }
-    response = requests.post("http://localhost:10002/generate", headers=headers, json=pload, stream=True)
-
-    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
-        if chunk:
-            data = json.loads(chunk.decode("utf-8"))
-            output = data["text"]
-            yield output
-
-for h in http_request():
-    print(h, flush=True)
--- a/cacheflow/master/block_manager.py
+++ b/cacheflow/master/block_manager.py
@@ -1,246 +0,0 @@
-from typing import Dict, List, Optional, Set, Tuple
-
-from cacheflow.block import PhysicalTokenBlock
-from cacheflow.sequence import Sequence
-from cacheflow.sequence import SequenceGroup
-from cacheflow.sequence import SequenceStatus
-from cacheflow.utils import Device
-
-
-class BlockAllocator:
-
-    def __init__(
-        self,
-        device: Device,
-        block_size: int,
-        num_blocks: int,
-    ) -> None:
-        self.device = device
-        self.block_size = block_size
-        self.num_blocks = num_blocks
-
-        # Initialize the free blocks.
-        # TODO(woosuk): Make this a priority queue.
-        self.free_blocks = [
-            PhysicalTokenBlock(device=device, block_number=i, block_size=block_size)
-            for i in range(num_blocks)
-        ]
-
-    def allocate(self) -> PhysicalTokenBlock:
-        if not self.free_blocks:
-            raise ValueError('Out of memory! '
-                             f'No more free blocks are available.')
-        block = self.free_blocks.pop()
-        block.ref_count = 1
-        return block
-
-    def free(self, block: PhysicalTokenBlock) -> None:
-        if block.ref_count == 0:
-            raise ValueError('Double free! '
-                             f'The block {block} is already freed.')
-        block.ref_count -= 1
-        if block.ref_count == 0:
-            self.free_blocks.append(block)
-
-    def get_num_free_blocks(self) -> int:
-        return len(self.free_blocks)
-
-
-# Mapping: logical block number -> physical block.
-BlockTable = List[PhysicalTokenBlock]
-
-
-class BlockSpaceManager:
-
-    def __init__(
-        self,
-        block_size: int,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-        watermark: float = 0.01,
-    ) -> None:
-        self.block_size = block_size
-        self.num_total_gpu_blocks = num_gpu_blocks
-        self.num_total_cpu_blocks = num_cpu_blocks
-        self.watermark = watermark
-        assert watermark >= 0.0
-
-        self.watermark_blocks = int(watermark * num_gpu_blocks)
-        self.gpu_allocator = BlockAllocator(Device.GPU, block_size, num_gpu_blocks)
-        self.cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks)
-
-        # Mapping: seq_id -> BlockTable.
-        self.block_tables: Dict[int, BlockTable] = {}
-
-    def can_allocate(self, seq_group: SequenceGroup) -> bool:
-        # FIXME(woosuk): Here we assume that all sequences in the group share
-        # the same prompt. This may not be true for preempted sequences.
-        seq = seq_group.seqs[0]
-        num_required_blocks = len(seq.logical_token_blocks)
-        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
-        # Use watermark to avoid frequent cache eviction.
-        return num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks
-
-    def allocate(self, seq_group: SequenceGroup) -> None:
-        # NOTE: Here we assume that all sequences in the group have the same prompt.
-        seq = seq_group.seqs[0]
-
-        # Allocate new physical token blocks that will store the prompt tokens.
-        block_table: BlockTable = []
-        for _ in range(len(seq.logical_token_blocks)):
-            block = self.gpu_allocator.allocate()
-            # Set the reference counts of the token blocks.
-            block.ref_count = seq_group.num_seqs()
-            block_table.append(block)
-
-        # Assign the block table for each sequence.
-        for seq in seq_group.seqs:
-            self.block_tables[seq.seq_id] = block_table.copy()
-
-    def can_append(self, seq_group: SequenceGroup) -> bool:
-        # Simple heuristic: If there is at least one free block
-        # for each sequence, we can append.
-        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
-        num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING)
-        return num_seqs <= num_free_gpu_blocks
-
-    def append(self, seq: Sequence) -> Optional[Tuple[int, int]]:
-        """Allocate a physical slot for the new token."""
-        logical_blocks = seq.logical_token_blocks
-        block_table = self.block_tables[seq.seq_id]
-
-        if len(block_table) < len(logical_blocks):
-            # The sequence has a new logical block.
-            # Allocate a new physical block.
-            block = self.gpu_allocator.allocate()
-            block_table.append(block)
-            return None
-
-        # We want to append the token to the last physical block.
-        last_block = block_table[-1]
-        assert last_block.device == Device.GPU
-        if last_block.ref_count == 1:
-            # Not shared with other sequences. Appendable.
-            return None
-        else:
-            # The last block is shared with other sequences.
-            # Copy on Write: Allocate a new block and copy the tokens.
-            new_block = self.gpu_allocator.allocate()
-            block_table[-1] = new_block
-            self.gpu_allocator.free(last_block)
-            return last_block.block_number, new_block.block_number
-
-    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
-        # NOTE: fork does not allocate a new physical block.
-        # Thus, it is always safe from OOM.
-        src_block_table = self.block_tables[parent_seq.seq_id]
-        self.block_tables[child_seq.seq_id] = src_block_table.copy()
-        for block in src_block_table:
-            block.ref_count += 1
-
-    def _get_physical_blocks(self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]:
-        # NOTE: Here, we assume that the physical blocks are only shared by
-        # the sequences in the same group.
-        blocks: Set[PhysicalTokenBlock] = set()
-        for seq in seq_group.seqs:
-            if seq.status == SequenceStatus.FINISHED:
-                continue
-            block_table = self.block_tables[seq.seq_id]
-            for block in block_table:
-                blocks.add(block)
-        return list(blocks)
-
-    def can_swap_in(self, seq_group: SequenceGroup) -> bool:
-        blocks = self._get_physical_blocks(seq_group)
-        num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
-        num_free_blocks = self.gpu_allocator.get_num_free_blocks()
-        # NOTE: Conservatively, we assume that every sequence will allocate
-        # at least one free block right after the swap-in.
-        # NOTE: This should match the logic in can_append().
-        num_required_blocks = len(blocks) + num_swapped_seqs
-        return num_free_blocks - num_required_blocks >= self.watermark_blocks
-
-    def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
-        # CPU block -> GPU block.
-        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
-        for seq in seq_group.seqs:
-            if seq.status == SequenceStatus.FINISHED:
-                continue
-            new_block_table: BlockTable = []
-            block_table = self.block_tables[seq.seq_id]
-
-            for cpu_block in block_table:
-                if cpu_block in mapping:
-                    gpu_block = mapping[cpu_block]
-                    gpu_block.ref_count += 1
-                else:
-                    gpu_block = self.gpu_allocator.allocate()
-                    mapping[cpu_block] = gpu_block
-                new_block_table.append(gpu_block)
-                # Free the CPU block swapped in to GPU.
-                self.cpu_allocator.free(cpu_block)
-            self.block_tables[seq.seq_id] = new_block_table
-
-        block_number_mapping = {
-            cpu_block.block_number: gpu_block.block_number
-            for cpu_block, gpu_block in mapping.items()
-        }
-        return block_number_mapping
-
-    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
-        blocks = self._get_physical_blocks(seq_group)
-        return len(blocks) <= self.cpu_allocator.get_num_free_blocks()
-
-    def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
-        # GPU block -> CPU block.
-        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
-        for seq in seq_group.seqs:
-            if seq.status == SequenceStatus.FINISHED:
-                continue
-            new_block_table: BlockTable = []
-            block_table = self.block_tables[seq.seq_id]
-
-            for gpu_block in block_table:
-                if gpu_block in mapping:
-                    cpu_block = mapping[gpu_block]
-                    cpu_block.ref_count += 1
-                else:
-                    cpu_block = self.cpu_allocator.allocate()
-                    mapping[gpu_block] = cpu_block
-                new_block_table.append(cpu_block)
-                # Free the GPU block swapped out to CPU.
-                self.gpu_allocator.free(gpu_block)
-            self.block_tables[seq.seq_id] = new_block_table
-
-        block_number_mapping = {
-            gpu_block.block_number: cpu_block.block_number
-            for gpu_block, cpu_block in mapping.items()
-        }
-        return block_number_mapping
-
-    def _free_block_table(self, block_table: BlockTable) -> None:
-        for block in block_table:
-            if block.device == Device.GPU:
-                self.gpu_allocator.free(block)
-            else:
-                self.cpu_allocator.free(block)
-
-    def free(self, seq: Sequence) -> None:
-        block_table = self.block_tables[seq.seq_id]
-        self._free_block_table(block_table)
-        del self.block_tables[seq.seq_id]
-
-    def reset(self) -> None:
-        for block_table in self.block_tables.values():
-            self._free_block_table(block_table)
-        self.block_tables.clear()
-
-    def get_block_table(self, seq: Sequence) -> List[int]:
-        block_table = self.block_tables[seq.seq_id]
-        return [block.block_number for block in block_table]
-
-    def get_num_free_gpu_blocks(self) -> int:
-        return self.gpu_allocator.get_num_free_blocks()
-
-    def get_num_free_cpu_blocks(self) -> int:
-        return self.cpu_allocator.get_num_free_blocks()
--- a/cacheflow/master/scheduler.py
+++ b/cacheflow/master/scheduler.py
@@ -1,529 +0,0 @@
-import enum
-import os
-import pickle
-import time
-from typing import Any, Dict, List, Optional, Tuple
-
-from cacheflow.master.block_manager import BlockSpaceManager
-from cacheflow.master.policy import PolicyFactory
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import Sequence
-from cacheflow.sequence import SequenceGroup
-from cacheflow.sequence import SequenceGroupInputs
-from cacheflow.sequence import SequenceOutputs
-from cacheflow.sequence import SequenceStatus
-
-
-class PreemptionMode(enum.Enum):
-    """Preemption modes.
-
-    1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
-    and swap them back in when the sequences are resumed.
-    2. Recomputation: Discard the blocks of the preempted sequences and
-    recompute them when the sequences are resumed, treating the sequences as
-    new prompts.
-    """
-    SWAP = enum.auto()
-    RECOMPUTE = enum.auto()
-
-
-class Scheduler:
-
-    def __init__(
-        self,
-        controllers: List,
-        block_size: int,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-        max_num_batched_tokens: int,
-        max_num_sequences: int,
-        collect_stats: bool,
-        do_memory_analysis: bool = False,
-    ) -> None:
-        self.controllers = controllers
-        self.block_size = block_size
-        self.num_gpu_blocks = num_gpu_blocks
-        self.num_cpu_blocks = num_cpu_blocks
-        self.max_num_batched_tokens = max_num_batched_tokens
-        self.max_num_sequences = max_num_sequences
-        self.collect_stats = collect_stats
-        self.do_memory_analysis = do_memory_analysis
-
-        # Instantiate the scheduling policy.
-        self.policy = PolicyFactory.get_policy(policy_name='fcfs')
-        # Create the block space manager.
-        self.block_manager = BlockSpaceManager(
-            block_size=block_size,
-            num_gpu_blocks=num_gpu_blocks,
-            num_cpu_blocks=num_cpu_blocks,
-        )
-
-        # Sequence groups in the WAITING state.
-        self.waiting: List[SequenceGroup] = []
-        # Sequence groups in the RUNNING state.
-        self.running: List[SequenceGroup] = []
-        # Mapping: group_id -> num_steps.
-        self.num_steps: Dict[int, int] = {}
-        # Mapping: group_id -> sampling params.
-        self.sampling_params: Dict[int, SamplingParams] = {}
-        # Sequence groups in the SWAPPED state.
-        self.swapped: List[SequenceGroup] = []
-
-        # Performance-related statistics.
-        self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
-
-    def add_sequence_groups(
-        self,
-        seq_groups: List[Tuple[SequenceGroup, SamplingParams]],
-    ) -> None:
-        # Add sequence groups to the waiting queue.
-        for seq_group, sampling_params in seq_groups:
-            self.waiting.append(seq_group)
-            self.sampling_params[seq_group.group_id] = sampling_params
-
-    def _schedule(
-        self,
-    ) -> Tuple[Dict[int, int], Dict[int, int], Dict[int, List[int]], List[int]]:
-        # Blocks that need to be swaped or copied before model execution.
-        blocks_to_swap_in: Dict[int, int] = {}
-        blocks_to_swap_out: Dict[int, int] = {}
-        blocks_to_copy: Dict[int, List[int]] = {}
-
-        # Fix the current time.
-        now = time.time()
-
-        # NOTE(woosuk): We prioritize the sequence groups in the RUNNING state
-        # in order to minimize the preemption overheads.
-        # Preemption happens only when there is no available slot to keep all
-        # the sequence groups in the RUNNING state.
-        # In this case, the policy is responsible for deciding which sequence
-        # groups to preempt.
-        self.running = self.policy.sort_by_priority(now, self.running)
-
-        # Reserve new token slots for the running sequence groups.
-        running: List[SequenceGroup] = []
-        preempted: List[SequenceGroup] = []
-        while self.running:
-            seq_group = self.running.pop(0)
-            while not self.block_manager.can_append(seq_group):
-                if self.running:
-                    # Preempt the lowest-priority sequence groups.
-                    victim_seq_group = self.running.pop(-1)
-                    self._preempt(victim_seq_group, blocks_to_swap_out)
-                    preempted.append(victim_seq_group)
-                else:
-                    # No other sequence groups can be preempted.
-                    # Preempt the current sequence group.
-                    self._preempt(seq_group, blocks_to_swap_out)
-                    preempted.append(seq_group)
-                    break
-            else:
-                # Append new slots to the sequence group.
-                self._append(seq_group, blocks_to_copy)
-                running.append(seq_group)
-        self.running = running
-
-        # Swap in the sequence groups in the SWAPPED state if possible.
-        self.swapped = self.policy.sort_by_priority(now, self.swapped)
-        # FCFS
-        while self.swapped and not blocks_to_swap_out:
-            seq_group = self.swapped[0]
-            # If the sequence group has been preempted in this step, stop.
-            if seq_group in preempted:
-                break
-            # If the sequence group cannot be swapped in, stop.
-            if not self.block_manager.can_swap_in(seq_group):
-                break
-
-            # The total number of sequences in the RUNNING state should not
-            # exceed the maximum number of sequences.
-            num_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
-            if len(self.running) + num_seqs > self.max_num_sequences:
-                break
-
-            seq_group = self.swapped.pop(0)
-            self._swap_in(seq_group, blocks_to_swap_in)
-            self._append(seq_group, blocks_to_copy)
-            self.running.append(seq_group)
-
-        num_batched_tokens = sum(
-            seq_group.num_seqs(status=SequenceStatus.RUNNING)
-            for seq_group in self.running
-        )
-
-        # Join waiting sequences if possible.
-        prompt_group_ids: List[int] = []
-        # NOTE(woosuk): The sequence groups in the SWAPPED state are strictly
-        # prioritized over the sequence groups in the WAITING state.
-        # This is because we want to bound the amount of CPU memory taken by
-        # the swapped sequence groups.
-        if not self.swapped:
-            self.waiting = self.policy.sort_by_priority(now, self.waiting)
-            while self.waiting:
-                seq_group = self.waiting[0]
-                # If the sequence group has been preempted in this step, stop.
-                if seq_group in preempted:
-                    break
-                # If the sequence group cannot be allocated, stop.
-                if not self.block_manager.can_allocate(seq_group):
-                    break
-
-                # If the number of batched tokens exceeds the limit, stop.
-                num_prompt_tokens = seq_group.seqs[0].get_len()
-                if (num_batched_tokens + num_prompt_tokens
-                    > self.max_num_batched_tokens):
-                    break
-
-                # The total number of sequences in the RUNNING state should not
-                # exceed the maximum number of sequences.
-                num_seqs = seq_group.num_seqs(status=SequenceStatus.WAITING)
-                if len(self.running) + num_seqs > self.max_num_sequences:
-                    break
-
-                seq_group = self.waiting.pop(0)
-                self._allocate(seq_group)
-                self.running.append(seq_group)
-                num_batched_tokens += num_prompt_tokens
-                prompt_group_ids.append(seq_group.group_id)
-
-        if self.collect_stats:
-            if self.running or blocks_to_swap_in or blocks_to_swap_out:
-                self.stats.timestamps.append(now - self.stats.start_time)
-                self.stats.input_lens.append(num_batched_tokens)
-                self.stats.swap_out_lens.append(len(blocks_to_swap_out) * self.block_size)
-                self.stats.swap_in_lens.append(len(blocks_to_swap_in) * self.block_size)
-                self.stats.num_preemption.append(len(preempted))
-                self.stats.num_swapped.append(len(self.swapped))
-                self.stats.num_running.append(len(self.running))
-                self.stats.num_waiting.append(len(self.waiting))
-
-                num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
-                num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
-                self.stats.gpu_cache_usage.append(num_used_gpu_blocks / self.num_gpu_blocks)
-                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
-                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
-                self.stats.cpu_cache_usage.append(num_used_cpu_blocks / self.num_cpu_blocks)
-
-                if self.do_memory_analysis:
-                    block_tables = self.block_manager.block_tables
-                    num_logical_blocks = 0
-                    num_logical_tokens = 0
-                    num_physical_blocks = 0
-                    num_physical_tokens = 0
-                    physical_block_numbers = set()
-                    num_reserved_tokens = 0
-                    for seq_group in self.running:
-                        group_id = seq_group.group_id
-                        sampling_params = self.sampling_params[group_id]
-                        max_num_steps = sampling_params.max_num_steps
-                        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-                            num_logical_blocks += len(seq.logical_token_blocks)
-                            num_logical_tokens += seq.get_len()
-
-                            seq_id = seq.seq_id
-                            block_table = block_tables[seq_id]
-                            for i, block in enumerate(block_table):
-                                if block.block_number in physical_block_numbers:
-                                    continue
-                                physical_block_numbers.add(block.block_number)
-                                num_physical_blocks += 1
-                                num_physical_tokens += seq.logical_token_blocks[i].num_tokens
-                    
-                    assert num_physical_blocks == num_used_gpu_blocks
-                    self.stats.num_logical_blocks.append(num_logical_blocks)
-                    self.stats.num_logical_tokens.append(num_logical_tokens)
-                    self.stats.num_physical_blocks.append(num_physical_blocks)
-                    self.stats.num_physical_tokens.append(num_physical_tokens)
-                    self.stats.num_reserved_tokens.append(num_reserved_tokens)
-
-        return (blocks_to_swap_in,
-                blocks_to_swap_out,
-                blocks_to_copy,
-                prompt_group_ids)
-
-    def step(self) -> List[SequenceGroup]:
-        # Schedule sequence groups.
-        # This function call changes the internal states of the scheduler
-        # such as self.running, self.swapped, and self.waiting.
-        scheduler_output = self._schedule()
-        blocks_to_swap_in = scheduler_output[0]
-        blocks_to_swap_out = scheduler_output[1]
-        blocks_to_copy = scheduler_output[2]
-        prompt_group_ids = scheduler_output[3]
-
-        # Create input data structures.
-        input_seq_groups: List[SequenceGroupInputs] = []
-        updated_seq_groups: List[SequenceGroup] = self.running.copy()
-
-        for seq_group in self.running:
-            group_id = seq_group.group_id
-            is_prompt = group_id in prompt_group_ids
-
-            input_tokens: Dict[int, List[int]] = {}
-            seq_logprobs: Dict[int, float] = {}
-            block_tables: Dict[int, List[int]] = {}
-            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-                seq_id = seq.seq_id
-                block_tables[seq_id] = self.block_manager.get_block_table(seq)
-                if is_prompt:
-                    input_tokens[seq_id] = seq.get_token_ids()
-                else:
-                    input_tokens[seq_id] = [seq.get_last_token_id()]
-                seq_logprobs[seq_id] = seq.cumulative_logprobs
-                # NOTE(woosuk): Sequences in the same group have the same
-                # sequence length
-                seq_len = seq.get_len()
-
-            input_seq_group = SequenceGroupInputs(
-                group_id=group_id,
-                is_prompt=is_prompt,
-                input_tokens=input_tokens,
-                context_len=seq_len,
-                seq_logprobs=seq_logprobs,
-                sampling_params=self.sampling_params[group_id],
-                block_tables=block_tables,
-            )
-            input_seq_groups.append(input_seq_group)
-
-        # Execute the first stage of the pipeline.
-        if input_seq_groups or blocks_to_swap_in or blocks_to_swap_out:
-            # Swap in and swap out should never happen at the same time.
-            assert not (blocks_to_swap_in and blocks_to_swap_out)
-            self.controllers[0].execute_stage(
-                input_seq_groups,
-                blocks_to_swap_in=blocks_to_swap_in,
-                blocks_to_swap_out=blocks_to_swap_out,
-                blocks_to_copy=blocks_to_copy,
-            )
-
-        return updated_seq_groups
-
-    def post_step(
-        self,
-        seq_outputs: Dict[int, SequenceOutputs],
-    ) -> None:
-        # Update the running sequences and free blocks.
-        for seq_group in self.running:
-            group_id = seq_group.group_id
-            self.num_steps[group_id] += 1
-            stop_token_ids = self.sampling_params[group_id].stop_token_ids
-
-            # Process beam search results before processing the next tokens.
-            for seq in seq_group.seqs:
-                if seq.status == SequenceStatus.FINISHED:
-                    continue
-
-                output = seq_outputs[seq.seq_id]
-                if seq.seq_id != output.parent_seq_id:
-                    # The sequence is a fork of the parent sequence (beam search).
-                    # Free the current sequence.
-                    self.block_manager.free(seq)
-                    # Fork the parent sequence.
-                    parent_seq = seq_group.find(output.parent_seq_id)
-                    parent_seq.fork(seq)
-                    self.block_manager.fork(parent_seq, seq)
-
-            # Process the next tokens.
-            for seq in seq_group.seqs:
-                if seq.status == SequenceStatus.FINISHED:
-                    continue
-
-                # Append a new token to the sequence.
-                output = seq_outputs[seq.seq_id]
-                seq.append(output.output_token, output.logprobs)
-
-                # Check if the sequence has generated a stop token.
-                if output.output_token in stop_token_ids:
-                    self._free_seq(seq)
-                    continue
-
-                # Check if the sequence has reached the maximum number of steps.
-                max_num_steps = self.sampling_params[group_id].max_num_steps
-                if self.num_steps[group_id] == max_num_steps:
-                    self._free_seq(seq)
-                    continue
-
-        # Update the running sequences.
-        running: List[SequenceGroup] = []
-        for seq_group in self.running:
-            if seq_group.is_finished():
-                self._free_seq_group(seq_group)
-            else:
-                running.append(seq_group)
-        self.running = running
-
-    def _allocate(self, seq_group: SequenceGroup) -> None:
-        self.block_manager.allocate(seq_group)
-        for seq in seq_group.seqs:
-            seq.status = SequenceStatus.RUNNING
-        # FIXME(woosuk): Support interactive generation.
-        if seq_group.group_id not in self.num_steps:
-            self.num_steps[seq_group.group_id] = 0
-
-    def _append(
-        self,
-        seq_group: SequenceGroup,
-        blocks_to_copy: Dict[int, List[int]],
-    ) -> None:
-        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-            ret = self.block_manager.append(seq)
-            if ret is not None:
-                src_block, dst_block = ret
-                if src_block in blocks_to_copy:
-                    blocks_to_copy[src_block].append(dst_block)
-                else:
-                    blocks_to_copy[src_block] = [dst_block]
-
-    def _preempt(
-        self,
-        seq_group: SequenceGroup,
-        blocks_to_swap_out: Dict[int, int],
-        preemption_mode: Optional[PreemptionMode] = None,
-    ) -> None:
-        # If preemption mode is not specified, we determine the mode as follows:
-        # We use recomputation by default since it incurs lower overhead than
-        # swapping. However, when the sequence group has multiple sequences
-        # (e.g., beam search), recomputation is not supported. In such a case,
-        # we use swapping instead.
-        # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
-        # As swapped sequences are prioritized over waiting sequences,
-        # sequence groups with multiple sequences are implicitly prioritized
-        # over sequence groups with a single sequence.
-        # TODO(woosuk): Support recomputation for sequence groups with multiple
-        # sequences. This may require a more sophisticated CUDA kernel.
-        if preemption_mode is None:
-            seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
-            if len(seqs) == 1:
-                preemption_mode = PreemptionMode.RECOMPUTE
-            else:
-                preemption_mode = PreemptionMode.SWAP
-        if preemption_mode == PreemptionMode.RECOMPUTE:
-            self._preempt_by_recompute(seq_group)
-        elif preemption_mode == PreemptionMode.SWAP:
-            self._preempt_by_swap(seq_group, blocks_to_swap_out)
-        else:
-            assert False, 'Invalid preemption mode.'
-
-    def _preempt_by_recompute(
-        self,
-        seq_group: SequenceGroup,
-    ) -> None:
-        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
-        assert len(seqs) == 1
-        for seq in seqs:
-            seq.status = SequenceStatus.WAITING
-            self.block_manager.free(seq)
-        self.waiting.append(seq_group)
-
-    def _preempt_by_swap(
-        self,
-        seq_group: SequenceGroup,
-        blocks_to_swap_out: Dict[int, int],
-    ) -> None:
-        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
-        for seq in seqs:
-            seq.status = SequenceStatus.SWAPPED
-        self._swap_out(seq_group, blocks_to_swap_out)
-        self.swapped.append(seq_group)
-
-    def _free_seq(self, seq: Sequence) -> None:
-        seq.status = SequenceStatus.FINISHED
-        self.block_manager.free(seq)
-
-    def _free_seq_group(self, seq_group: SequenceGroup) -> None:
-        group_id = seq_group.group_id
-        del self.num_steps[group_id]
-        del self.sampling_params[group_id]
-
-    def _swap_in(
-        self,
-        seq_group: SequenceGroup,
-        blocks_to_swap_in: Dict[int, int],
-    ) -> None:
-        mapping = self.block_manager.swap_in(seq_group)
-        blocks_to_swap_in.update(mapping)
-        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
-            seq.status = SequenceStatus.RUNNING
-
-    def _swap_out(
-        self,
-        seq_group: SequenceGroup,
-        blocks_to_swap_out: Dict[int, int],
-    ) -> None:
-        assert self.block_manager.can_swap_out(seq_group)
-        mapping = self.block_manager.swap_out(seq_group)
-        blocks_to_swap_out.update(mapping)
-        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-            seq.status = SequenceStatus.SWAPPED
-
-    def reset_stats(self) -> None:
-        self.stats.reset(self.num_gpu_blocks, self.num_cpu_blocks)
-
-    def save_stats(
-        self,
-        output_dir: str,
-    ) -> None:
-        assert self.collect_stats, 'Statistics collection is disabled.'
-        self.stats.save(output_dir)
-
-
-class Stats:
-
-    def __init__(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        self.start_time: float = time.time()
-        self.num_gpu_blocks = num_gpu_blocks
-        self.num_cpu_blocks = num_cpu_blocks
-
-        self.timestamps: List[float] = []
-        self.input_lens: List[int] = []
-        self.swap_out_lens: List[int] = []
-        self.swap_in_lens: List[int] = []
-        self.num_preemption: List[int] = []
-        self.num_waiting: List[int] = []
-        self.num_running: List[int] = []
-        self.num_swapped: List[int] = []
-        self.gpu_cache_usage: List[float] = []
-        self.cpu_cache_usage: List[float] = []
-
-        self.num_logical_blocks: List[int] = []
-        self.num_logical_tokens: List[int] = []
-        self.num_physical_blocks: List[int] = []
-        self.num_physical_tokens: List[int] = []
-        self.num_reserved_tokens: List[int] = []
-
-    def reset(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        self.__init__(num_gpu_blocks, num_cpu_blocks)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            'start_time': self.start_time,
-            'num_gpu_blocks': self.num_gpu_blocks,
-            'num_cpu_blocks': self.num_cpu_blocks,
-            'timestamps': self.timestamps,
-            'input_lens': self.input_lens,
-            'swap_out_lens': self.swap_out_lens,
-            'swap_in_lens': self.swap_in_lens,
-            'num_preemption': self.num_preemption,
-            'num_waiting': self.num_waiting,
-            'num_running': self.num_running,
-            'num_swapped': self.num_swapped,
-            'gpu_cache_usage': self.gpu_cache_usage,
-            'cpu_cache_usage': self.cpu_cache_usage,
-            'num_logical_blocks': self.num_logical_blocks,
-            'num_logical_tokens': self.num_logical_tokens,
-            'num_physical_blocks': self.num_physical_blocks,
-            'num_physical_tokens': self.num_physical_tokens,
-            'num_reserved_tokens': self.num_reserved_tokens,
-        }
-
-    def save(self, output_dir: str) -> None:
-        with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
-            pickle.dump(self.to_dict(), f)
--- a/cacheflow/master/server.py
+++ b/cacheflow/master/server.py
@@ -1,192 +0,0 @@
-import argparse
-from typing import List, Tuple
-import random
-
-import ray
-
-from cacheflow.master.scheduler import Scheduler
-from cacheflow.models import get_memory_analyzer
-from cacheflow.worker.controller import Controller, DeviceID
-from cacheflow.sequence import SequenceGroup
-from cacheflow.sampling_params import SamplingParams
-
-
-class Server:
-    def __init__(
-        self,
-        model: str,
-        model_path: str,
-        use_dummy_weights: bool,
-        pipeline_parallel_size: int,
-        tensor_parallel_size: int,
-        block_size: int,
-        dtype: str,
-        seed: int,
-        swap_space: int,
-        max_num_batched_tokens: int,
-        max_num_sequences: int,
-        num_nodes: int,
-        num_devices_per_node: int,
-        distributed_init_method: str,
-        all_stage_devices: List[List[DeviceID]],
-        gpu_memory: int,
-        cpu_memory: int,
-        collect_stats: bool = False,
-        do_memory_analysis: bool = False,
-    ):
-        self.num_nodes = num_nodes
-        self.num_devices_per_node = num_devices_per_node
-        self.world_size = pipeline_parallel_size * tensor_parallel_size
-
-        self.memory_analyzer = get_memory_analyzer(
-            model_name=model,
-            block_size=block_size,
-            dtype=dtype,
-            gpu_memory=gpu_memory,
-            cpu_memory=cpu_memory,
-            tensor_parallel_size=tensor_parallel_size,
-        )
-        self.num_gpu_blocks = self.memory_analyzer.get_max_num_gpu_blocks(
-            max_num_batched_tokens=max_num_batched_tokens)
-        self.num_cpu_blocks = self.memory_analyzer.get_max_num_cpu_blocks(
-            swap_space=swap_space)
-        print(f'# GPU blocks: {self.num_gpu_blocks}, '
-              f'# CPU blocks: {self.num_cpu_blocks}')
-
-        # Create a controller for each pipeline stage.
-        self.controllers: List[Controller] = []
-        for i in range(pipeline_parallel_size):
-            controller = Controller(
-                stage_id=i,
-                stage_devices=all_stage_devices[i],
-                world_size=self.world_size,
-                pipeline_parallel_size=pipeline_parallel_size,
-                tensor_parallel_size=tensor_parallel_size,
-                distributed_init_method=distributed_init_method,
-                model_name=model,
-                block_size=block_size,
-                num_gpu_blocks=self.num_gpu_blocks,
-                num_cpu_blocks=self.num_cpu_blocks,
-                dtype=dtype,
-                seed=seed,
-                model_path=model_path,
-                use_dummy_weights=use_dummy_weights,
-                max_num_batched_tokens=max_num_batched_tokens,
-            )
-            self.controllers.append(controller)
-
-        # Create a scheduler.
-        self.scheduler = Scheduler(
-            controllers=self.controllers,
-            block_size=block_size,
-            num_gpu_blocks=self.num_gpu_blocks,
-            num_cpu_blocks=self.num_cpu_blocks,
-            max_num_batched_tokens=max_num_batched_tokens,
-            max_num_sequences=max_num_sequences,
-            collect_stats=collect_stats,
-            do_memory_analysis=do_memory_analysis,
-        )
-        # Connect the controllers.
-        for i in range(len(self.controllers) - 1):
-            self.controllers[i].set_next(self.controllers[i + 1])
-        self.controllers[-1].set_next(self.scheduler)
-
-    def add_sequence_groups(
-        self,
-        sequence_groups: List[Tuple[SequenceGroup, SamplingParams]]
-    ):
-        self.scheduler.add_sequence_groups(sequence_groups)
-
-    def step(self):
-        return self.scheduler.step()
-
-    def has_unfinished_requests(self):
-        return (self.scheduler.waiting or self.scheduler.running or
-                self.scheduler.swapped)
-
-
-def initialize_ray_cluster(
-    address: str = 'auto',
-    pipeline_parallel_size: int = 1,
-    tensor_parallel_size: int = 1,
-) -> Tuple[int, int, str, List[List[DeviceID]]]:
-    # Connect to a ray cluster.
-    ray.init(address=address)
-
-    # Assume we have a uniform cluster that each node has the same number of
-    # GPUs for now.
-    valid_node_resources = []
-    num_devices_per_node = None
-    for node in ray.nodes():
-        if (not node['Alive']) or node['Resources']['GPU'] <= 0:
-            continue
-        if num_devices_per_node is None:
-            num_devices_per_node = node['Resources']['GPU']
-        else:
-            assert num_devices_per_node == node['Resources']['GPU'], (
-                "The number of GPUs per node is not uniform.")
-        for key in node['Resources']:
-            if key.startswith('node:'):
-                valid_node_resources.append(key)
-
-    num_nodes = len(valid_node_resources)
-
-    assert (pipeline_parallel_size * tensor_parallel_size
-            <= num_nodes * num_devices_per_node), (
-                "The number of required GPUs exceeds the total number of "
-                "available GPUs.")
-    if tensor_parallel_size >= num_devices_per_node:
-        assert tensor_parallel_size % num_devices_per_node == 0, (
-            "The number of tensor parallelism is not divisible by the "
-            "number of GPUs per node.")
-    else:
-        assert num_devices_per_node % tensor_parallel_size == 0, (
-            "The number of GPUs per node is not divisible by the number "
-            "of tensor parallelism.")
-
-    # Assign GPUs to pipeline stages.
-    rank = 0
-    current_node_id = 0
-    current_device_id = 0
-    distributed_init_method = None
-    all_stage_devices = []
-
-    for i in range(pipeline_parallel_size):
-        stage_devices = []
-        for j in range(tensor_parallel_size):
-            node_resource = valid_node_resources[current_node_id]
-            stage_devices.append((rank, node_resource, current_device_id))
-            if distributed_init_method is None:
-                ip = node_resource.split("node:")[-1]
-                port = random.randint(10000, 20000)
-                distributed_init_method = f"tcp://{ip}:{port}"
-            rank += 1
-            current_device_id += 1
-            if current_device_id >= num_devices_per_node:
-                current_node_id += 1
-                current_device_id = 0
-        all_stage_devices.append(stage_devices)
-
-    return (num_nodes, num_devices_per_node, distributed_init_method,
-            all_stage_devices)
-
-
-def add_server_arguments(parser: argparse.ArgumentParser):
-    # Model arguments
-    parser.add_argument('--model', type=str, default='facebook/opt-125m', help='model name')
-    parser.add_argument('--model-path', type=str, default='~/.cacheflow/model_weights',
-                        help='model path to download and load the weights')
-    # Parallel arguments
-    parser.add_argument('--pipeline-parallel-size', '-pp', type=int, default=1, help='number of pipeline stages')
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1, help='number of tensor parallel replicas')
-    # KV cache arguments
-    parser.add_argument('--block-size', type=int, default=16, choices=[1, 2, 4, 8, 16, 32, 64, 128, 256], help='token block size')
-    # NOTE(woosuk): If FlashAttention is used, the float data type is not supported.
-    parser.add_argument('--dtype', type=str, default='half', choices=['half'], help='data type')
-    # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
-    parser.add_argument('--seed', type=int, default=0, help='random seed')
-    parser.add_argument('--swap-space', type=int, default=20, help='CPU swap space size (GiB) per GPU')
-    parser.add_argument('--max-num-batched-tokens', type=int, default=2560, help='maximum number of batched tokens per iteration')
-    parser.add_argument('--max-num-sequences', type=int, default=256, help='maximum number of sequences per iteration')
-    parser.add_argument('--use-dummy-weights', action='store_true', help='use dummy values for model weights')
-    return parser
--- a/cacheflow/master/simple_frontend.py
+++ b/cacheflow/master/simple_frontend.py
@@ -1,69 +0,0 @@
-import time
-from typing import List, Optional, Set, Tuple
-
-from transformers import AutoTokenizer
-
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import Sequence, SequenceGroup
-from cacheflow.utils import Counter
-
-
-class SimpleFrontend:
-
-    def __init__(
-        self,
-        model_name: str,
-        block_size: int,
-    ) -> None:
-        self.block_size = block_size
-
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.seq_group_counter = Counter()
-        self.seq_counter = Counter()
-        self.inputs: List[Tuple[SequenceGroup, SamplingParams]] = []
-
-    def add_eos_token(self, sampling_params: SamplingParams) -> SamplingParams:
-        # Stop generation when we see an EOS token.
-        sampling_params.stop_token_ids.add(self.tokenizer.eos_token_id)
-        return sampling_params
-
-    def query(
-        self,
-        prompt: str,
-        sampling_params: SamplingParams,
-    ) -> None:
-        token_ids = self.tokenizer.encode(prompt)
-        self._add_query(token_ids, sampling_params)
-
-    def _add_query(
-        self,
-        token_ids: List[int],
-        sampling_params: SamplingParams,
-        arrival_time: Optional[float] = None,
-    ) -> None:
-        if arrival_time is None:
-            arrival_time = time.time()
-        seqs: List[Sequence] = []
-        for _ in range(sampling_params.n):
-            seq_id = next(self.seq_counter)
-            seq = Sequence(seq_id, token_ids, block_size=self.block_size)
-            seqs.append(seq)
-
-        group_id = next(self.seq_group_counter)
-        seq_group = SequenceGroup(group_id, seqs, arrival_time)
-        self.inputs.append((seq_group, sampling_params))
-
-    def get_inputs(self) -> List[Tuple[SequenceGroup, SamplingParams]]:
-        inputs = self.inputs
-        self.inputs = []
-        return inputs
-
-    def print_response(
-        self,
-        seq_group: SequenceGroup,
-    ) -> None:
-        for seq in seq_group.seqs:
-            token_ids = seq.get_token_ids()
-            output = self.tokenizer.decode(token_ids, skip_special_tokens=True)
-            output = output.strip()
-            print(f'Seq {seq.seq_id}: {output!r}')
--- a/cacheflow/models/init.py
+++ b/cacheflow/models/init.py
@@ -1,10 +0,0 @@
-from cacheflow.models.input_metadata import InputMetadata
-from cacheflow.models.model_utils import get_memory_analyzer
-from cacheflow.models.model_utils import get_model
-
-
-__all__ = [
-    'InputMetadata',
-    'get_memory_analyzer',
-    'get_model',
-]
--- a/cacheflow/models/activation.py
+++ b/cacheflow/models/activation.py
@@ -1,20 +0,0 @@
-import torch
-import torch.nn as nn
-
-from cacheflow import activation_ops
-
-
-class SiluAndMul(nn.Module):
-
-    def __init__(self):
-        super().__init__()
-
-    def forward(
-        self,
-        x: torch.Tensor,        # (num_tokens, 2 * d)
-    ) -> torch.Tensor:          # (num_tokens, d)
-        num_tokens = x.shape[0]
-        d = x.shape[1] // 2
-        out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
-        activation_ops.silu_and_mul(out, x)
-        return out
--- a/cacheflow/models/attention.py
+++ b/cacheflow/models/attention.py
@@ -1,207 +0,0 @@
-from typing import Optional
-
-from flash_attn.flash_attn_interface import _flash_attn_forward
-import torch
-import torch.nn as nn
-
-from cacheflow import attention_ops
-from cacheflow import cache_ops
-from cacheflow import pos_encoding_ops
-from cacheflow.models import InputMetadata
-
-
-class GPTCacheFlowAttention(nn.Module):
-
-    def __init__(self, scale: float) -> None:
-        super().__init__()
-        self.scale = float(scale)
-
-    def multi_query_kv_attention(
-        self,
-        output: torch.Tensor,                   # [num_prompt_tokens, num_heads, head_size]
-        query: torch.Tensor,                    # [num_prompt_tokens, num_heads, head_size]
-        key: torch.Tensor,                      # [num_prompt_tokens, num_heads, head_size]
-        value: torch.Tensor,                    # [num_prompt_tokens, num_heads, head_size]
-        cumulative_prompt_lens: torch.Tensor,   # [num_prompts + 1]
-        max_prompt_len: int,
-    ) -> None:
-        if query.dtype == torch.float:
-            raise ValueError('The float data type is not supported by '
-                             'FlashAttention. Use the half data type instead.')
-        head_size = query.shape[-1]
-        if head_size > 128:
-            raise ValueError('FlashAttention does not support head_size > 128.')
-
-        # Directly call FlashAttention's internal function to avoid allocating
-        # a new tensor for the output.
-        _flash_attn_forward(
-            query,
-            key,
-            value,
-            output,
-            cumulative_prompt_lens,
-            cumulative_prompt_lens,
-            max_prompt_len,
-            max_prompt_len,
-            dropout_p=0.0,
-            softmax_scale=self.scale,
-            causal=True,
-            return_softmax=False,
-        )
-
-    def single_query_cached_kv_attention(
-        self,
-        output: torch.Tensor,           # [num_generation_tokens, num_heads, head_size]
-        query: torch.Tensor,            # [num_generation_tokens, num_heads, head_size]
-        key_cache: torch.Tensor,        # [num_blocks, num_heads, head_size/x, block_size, x]
-        value_cache: torch.Tensor,      # [num_blocks, num_heads, head_size, block_size]
-        input_metadata: InputMetadata,
-    ) -> None:
-        head_size = value_cache.shape[2]
-        supported_head_sizes = [32, 64, 80, 96, 128, 160, 192, 256]
-        if head_size not in supported_head_sizes:
-            raise ValueError(f'head_size ({head_size}) is not supported by '
-                             'the single_query_cached_kv_attention kernel. '
-                             'Use one of the following head sizes: '
-                             f'{supported_head_sizes}.')
-
-        block_size = value_cache.shape[3]
-        attention_ops.single_query_cached_kv_attention(
-            output,
-            query,
-            key_cache,
-            value_cache,
-            self.scale,
-            input_metadata.block_tables,
-            input_metadata.context_lens,
-            block_size,
-            input_metadata.max_context_len,
-        )
-
-    def forward(
-        self,
-        query: torch.Tensor,                    # [num_tokens, num_heads * head_size]
-        key: torch.Tensor,                      # [num_tokens, num_heads * head_size]
-        value: torch.Tensor,                    # [num_tokens, num_heads * head_size]
-        key_cache: torch.Tensor,                # [num_blocks, num_heads, head_size/x, block_size, x]
-        value_cache: torch.Tensor,              # [num_blocks, num_heads, head_size, block_size]
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:                          # [num_tokens, num_heads * head_size]
-        # NOTE: The query, key, and value tensors must be sliced from a qkv
-        # tensor of shape [num_tokens, 3 * num_heads * head_size].
-
-        # Reshape the query, key, and value tensors.
-        num_heads = value_cache.shape[1]
-        head_size = value_cache.shape[2]
-        query = query.view(-1, num_heads, head_size)
-        key = key.view(-1, num_heads, head_size)
-        value = value.view(-1, num_heads, head_size)
-
-        # Pre-allocate the output tensor.
-        output = torch.empty_like(query)
-
-        # Compute the attention op for prompts.
-        num_prompt_tokens = input_metadata.num_prompt_tokens
-        if num_prompt_tokens > 0:
-            self.multi_query_kv_attention(
-                output[:num_prompt_tokens],
-                query[:num_prompt_tokens],
-                key[:num_prompt_tokens],
-                value[:num_prompt_tokens],
-                input_metadata.cumulative_prompt_lens,
-                input_metadata.max_prompt_len,
-            )
-
-        # Wait until the cache op is done.
-        if cache_event is not None:
-            cache_event.wait()
-
-        # Reshape the keys and values and store them in the cache.
-        num_valid_tokens = input_metadata.num_valid_tokens
-        if num_valid_tokens > 0:
-            # The stride is 3 because the key and value are sliced from qkv.
-            cache_ops.reshape_and_cache(
-                key[:num_valid_tokens],
-                value[:num_valid_tokens],
-                key_cache,
-                value_cache,
-                input_metadata.slot_mapping,
-            )
-
-        if input_metadata.num_generation_tokens > 0:
-            # Compute the attention op for generation tokens.
-            self.single_query_cached_kv_attention(
-                output[num_prompt_tokens:num_valid_tokens],
-                query[num_prompt_tokens:num_valid_tokens],
-                key_cache,
-                value_cache,
-                input_metadata)
-
-        # Reshape the output tensor.
-        # NOTE(woosuk): The output tensor may include paddings.
-        return output.view(-1, num_heads * head_size)
-
-
-class OPTCacheFlowAttention(GPTCacheFlowAttention):
-    """OPT uses the same attention mechanism as GPT."""
-
-    def __init__(self, scale: float) -> None:
-        super().__init__(scale)
-
-
-class LlamaCacheFlowAttention(GPTCacheFlowAttention):
-    """Llama uses GPT-NeoX style rotary embedding."""
-
-    def __init__(
-        self,
-        scale: float,
-        head_size: int,
-        max_position: int = 8192,
-        base: int = 10000,
-    ) -> None:
-        super().__init__(scale)
-
-        # Create the cos and sin cache.
-        inv_freq = 1.0 / (base ** (torch.arange(0, head_size, 2) / head_size))
-        t = torch.arange(max_position).float()
-        freqs = torch.einsum('i,j -> ij', t, inv_freq.float())
-        cos = freqs.cos()
-        sin = freqs.sin()
-        cache = torch.cat((cos, sin), dim=-1)
-
-        # FIXME(woosuk): This assumes that we configure the default dtype when
-        # initializing the model. Make it more robust.
-        torch_dtype = torch.get_default_dtype()
-        cache = cache.to(torch_dtype)
-        # Embedding size: [max_position, head_size]
-        self.register_buffer('cos_sin_cache', cache, persistent=False)
-
-    def forward(
-        self,
-        positions: torch.LongTensor,            # [num_tokens]
-        query: torch.Tensor,                    # [num_tokens, num_heads * head_size]
-        key: torch.Tensor,                      # [num_tokens, num_heads * head_size]
-        value: torch.Tensor,                    # [num_tokens, num_heads * head_size]
-        key_cache: torch.Tensor,                # [num_blocks, num_heads, head_size/x, block_size, x]
-        value_cache: torch.Tensor,              # [num_blocks, num_heads, head_size, block_size]
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:                          # [num_tokens, num_heads * head_size]
-        # Apply rotary embedding to the query and key before passing them
-        # to the attention op.
-        pos_encoding_ops.rotary_embedding_neox(
-            positions,
-            query,
-            key,
-            self.cos_sin_cache,
-        )
-        return super().forward(
-            query,
-            key,
-            value,
-            key_cache,
-            value_cache,
-            input_metadata,
-            cache_event,
-        )
--- a/cacheflow/models/input_metadata.py
+++ b/cacheflow/models/input_metadata.py
@@ -1,55 +0,0 @@
-from typing import List, Dict, Tuple
-
-import torch
-
-from cacheflow.sampling_params import SamplingParams
-
-
-class InputMetadata:
-
-    def __init__(
-        self,
-        seq_groups: List[Tuple[List[int], SamplingParams]],
-        seq_logprobs: Dict[int, float],                         # Seq id -> cumulative logprobs.
-        prompt_lens: List[int],
-        cumulative_prompt_lens: torch.Tensor,
-        slot_mapping: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        block_tables: torch.Tensor,
-    ) -> None:
-        self.seq_groups = seq_groups
-        self.seq_logprobs = seq_logprobs
-        self.prompt_lens = prompt_lens
-        self.cumulative_prompt_lens = cumulative_prompt_lens
-        self.slot_mapping = slot_mapping
-        self.context_lens = context_lens
-        self.max_context_len = max_context_len
-        self.block_tables = block_tables
-
-        self.num_prompts = len(prompt_lens)
-        self.num_prompt_tokens = sum(prompt_lens)
-        self.max_prompt_len = max(prompt_lens) if prompt_lens else 0
-        self.num_generation_tokens = context_lens.shape[0]
-        self.num_valid_tokens = slot_mapping.shape[0]
-        if block_tables.numel() > 0:
-            self.max_num_blocks_per_seq = block_tables.shape[1]
-        else:
-            self.max_num_blocks_per_seq = 0
-        assert block_tables.shape[0] == self.num_generation_tokens
-        assert context_lens.shape[0] == self.num_generation_tokens
-
-    def __repr__(self) -> str:
-        return (f'InputMetadata('
-                f'num_prompts={self.num_prompts}, '
-                f'num_prompt_tokens={self.num_prompt_tokens}, '
-                f'max_prompt_len={self.max_prompt_len}, '
-                f'num_generation_tokens={self.num_generation_tokens}, '
-                f'num_valid_tokens={self.num_valid_tokens}, '
-                f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
-                f'max_context_len={self.max_context_len}), '
-                f'prompt_lens={self.prompt_lens}, '
-                f'cumulative_prompt_lens={self.cumulative_prompt_lens}, '
-                f'slot_mapping={self.slot_mapping}, '
-                f'context_lens={self.context_lens}, '
-                f'block_tables={self.block_tables})')
--- a/cacheflow/models/layernorm.py
+++ b/cacheflow/models/layernorm.py
@@ -1,26 +0,0 @@
-import torch
-import torch.nn as nn
-
-from cacheflow import layernorm_ops
-
-
-class RMSNorm(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        eps: float = 1e-6,
-    ) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        out = torch.empty_like(x)
-        layernorm_ops.rms_norm(
-            out,
-            x,
-            self.weight.data,
-            self.variance_epsilon,
-        )
-        return out
--- a/cacheflow/models/llama.py
+++ b/cacheflow/models/llama.py
@@ -1,292 +0,0 @@
-"""1D LLaMA model compatible with HuggingFace weights."""
-import os
-import glob
-import filelock
-from tqdm import tqdm
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-import torch
-from torch import nn
-from transformers import LlamaConfig
-
-from cacheflow.models import InputMetadata
-from cacheflow.models.activation import SiluAndMul
-from cacheflow.models.attention import LlamaCacheFlowAttention
-from cacheflow.models.layernorm import RMSNorm
-from cacheflow.models.sample import Sampler
-from cacheflow.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
-from cacheflow.sequence import SequenceOutputs
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class LlamaMLP(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-    ):
-        super().__init__()
-        self.gate_up_proj = ColumnParallelLinear(hidden_size, 2 * intermediate_size,
-                                                 bias=False, gather_output=False,
-                                                 perform_initialization=False)
-        self.down_proj = RowParallelLinear(intermediate_size, hidden_size,
-                                           bias=False, input_is_parallel=True,
-                                           perform_initialization=False)
-        if hidden_act != 'silu':
-            raise ValueError(f'Unsupported activation: {hidden_act}. '
-                             'Only silu is supported for now.')
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
-        return x
-
-
-class LlamaAttention(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-    ):
-        super().__init__()
-        self.hidden_size = hidden_size
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
-        self.head_dim = hidden_size // self.total_num_heads
-        self.scaling = self.head_dim ** -0.5
-
-        self.qkv_proj = ColumnParallelLinear(
-            hidden_size,
-            3 * self.total_num_heads * self.head_dim,
-            bias=False,
-            gather_output=False,
-            perform_initialization=False,
-        )
-        self.o_proj = RowParallelLinear(
-            self.total_num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-            input_is_parallel=True,
-            perform_initialization=False,
-        )
-        self.attn = LlamaCacheFlowAttention(self.scaling, self.head_dim)
-
-    def forward(
-        self,
-        positions: torch.LongTensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.chunk(chunks=3, dim=-1)
-        k_cache, v_cache = kv_cache
-        attn_output = self.attn(
-            positions, q, k, v, k_cache, v_cache, input_metadata, cache_event)
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
-class LlamaDecoderLayer(nn.Module):
-
-    def __init__(self, config: LlamaConfig):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.self_attn = LlamaAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-        )
-        self.mlp = LlamaMLP(
-            hidden_size=self.hidden_size,
-            intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
-        )
-        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        positions: torch.LongTensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        # Self Attention
-        residual = hidden_states
-        hidden_states = self.input_layernorm(hidden_states)
-        hidden_states = self.self_attn(
-            positions=positions,
-            hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-        return hidden_states
-
-
-class LlamaModel(nn.Module):
-
-    def __init__(self, config: LlamaConfig):
-        super().__init__()
-        self.config = config
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size,
-                                                   perform_initialization=False)
-        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        positions: torch.LongTensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
-        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
-            layer = self.layers[i]
-            hidden_states = layer(
-                positions,
-                hidden_states,
-                kv_caches[i],
-                input_metadata,
-                cache_event,
-            )
-        hidden_states = self.norm(hidden_states)
-        return hidden_states
-
-
-class LlamaForCausalLM(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.model = LlamaModel(config)
-        self.lm_head = ColumnParallelLinear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            gather_output=False,
-                                            perform_initialization=False)
-        self.sampler = Sampler()
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        positions: torch.LongTensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> Dict[int, SequenceOutputs]:
-        hidden_states = self.model(
-            input_ids, positions, kv_caches, input_metadata, cache_events)
-        next_tokens = self.sampler(
-            self.lm_head.weight, hidden_states, input_metadata)
-        return next_tokens
-
-    _column_parallel_weights = ["embed_tokens.weight", "lm_head.weight",
-                                "qkv_proj.weight", "gate_proj.weight",
-                                "up_proj.weight"]
-    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
-
-    def load_weights(self, weights_path: str):
-        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-        for name, param in state_dict.items():
-            if "qkv_proj" in name or "gate_up_proj" in name:
-                if "qkv_proj" in name:
-                    original_name = "qkv_proj"
-                    weight_names = ["q_proj", "k_proj", "v_proj"]
-                    shard_size = param.shape[0] // 3
-                else:
-                    original_name = "gate_up_proj"
-                    weight_names = ["gate_proj", "up_proj"]
-                    shard_size = param.shape[0] // 2
-                weights_to_concat = []
-                for weight_name in weight_names:
-                    weight = np.load(os.path.join(
-                        weights_path, name.replace(original_name, weight_name)))
-                    weights_to_concat.append(weight[
-                        shard_size * tensor_model_parallel_rank
-                        :shard_size * (tensor_model_parallel_rank + 1)])
-                loaded_weight = torch.from_numpy(
-                    np.concatenate(weights_to_concat, axis=0))
-            else:
-                loaded_weight = torch.from_numpy(
-                    np.load(os.path.join(weights_path, name)))
-                for p in self._column_parallel_weights:
-                    if p in name:
-                        shard_size = param.shape[0]
-                        loaded_weight = loaded_weight[
-                            shard_size * tensor_model_parallel_rank
-                            :shard_size * (tensor_model_parallel_rank + 1)]
-                        break
-                for p in self._row_parallel_weights:
-                    if p in name:
-                        shard_size = param.shape[1]
-                        loaded_weight = loaded_weight[
-                            :,
-                            shard_size * tensor_model_parallel_rank
-                            :shard_size * (tensor_model_parallel_rank + 1)]
-                        break
-
-            assert param.shape == loaded_weight.shape
-            param.data.copy_(loaded_weight)
-
-    @staticmethod
-    def get_weights(model_name: str, path: str):
-        if not os.path.isfile(os.path.join(model_name, "config.json")):
-            raise ValueError("LLaMA model's model_name has to be a path"
-                             "to the huggingface model's directory.")
-        path = os.path.join(model_name, f"np")
-        path = os.path.abspath(os.path.expanduser(path))
-        os.makedirs(path, exist_ok=True)
-        lock_path = os.path.join(path, "file_lock")
-        lock = filelock.FileLock(lock_path)
-
-        with lock:
-            test_weight_path = os.path.join(path, "model.embed_tokens.weight")
-            if os.path.exists(test_weight_path):
-                return path
-
-            bin_files = glob.glob(os.path.join(model_name, "*.bin"))
-
-            for bin_file in tqdm(bin_files, desc="Convert format"):
-                state = torch.load(bin_file, map_location="cpu")
-                for name, param in tqdm(state.items(), leave=False):
-                    param_path = os.path.join(path, name)
-                    with open(param_path, "wb") as f:
-                        np.save(f, param.cpu().detach().numpy())
-
-            return path
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-0.1, 0.1)
--- a/cacheflow/models/memory_analyzer.py
+++ b/cacheflow/models/memory_analyzer.py
@@ -1,240 +0,0 @@
-import torch
-from transformers import AutoConfig
-
-from cacheflow.models.utils import get_dtype_size
-
-_GiB = 1 << 30
-
-
-class CacheFlowMemoryAnalyzer:
-
-    def get_max_num_gpu_blocks(
-        self,
-        max_num_batched_tokens: int,
-        memory_utilization: float,
-    ) -> int:
-        raise NotImplementedError()
-
-    def get_workspace_size(self) -> int:
-        return 1 * _GiB
-
-    def get_cache_block_size(self) -> int:
-        raise NotImplementedError()
-
-    def get_max_num_cpu_blocks(
-        self,
-        swap_space: int,
-    ) -> int:
-        swap_space = swap_space * _GiB
-        cpu_memory = self.cpu_memory
-        if swap_space > 0.8 * cpu_memory:
-            raise ValueError(f'The swap space ({swap_space / _GiB:.2f} GiB) '
-                             'takes more than 80% of the available memory '
-                             f'({cpu_memory / _GiB:.2f} GiB).'
-                             'Please check the swap space size.')
-        if swap_space > 0.5 * cpu_memory:
-            print(f'WARNING: The swap space ({swap_space / _GiB:.2f} GiB) '
-                  'takes more than 50% of the available memory '
-                  f'({cpu_memory / _GiB:.2f} GiB).'
-                  'This may slow the system performance.')
-        max_num_blocks = swap_space // self.get_cache_block_size()
-        return max_num_blocks
-
-
-class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
-
-    def __init__(
-        self,
-        model_name: str,
-        block_size: int,
-        dtype: torch.dtype,
-        gpu_memory: int,
-        cpu_memory: int,
-        tensor_parallel_size: int,
-    ) -> None:
-        self.model_name = model_name
-        self.block_size = block_size
-        self.dtype = dtype
-        self.gpu_memory = gpu_memory
-        self.cpu_memory = cpu_memory
-        self.tensor_parallel_size = tensor_parallel_size
-
-        config = AutoConfig.from_pretrained(model_name)
-        self.num_layers = config.num_hidden_layers
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_size = config.hidden_size // self.num_heads
-        self.ffn_size = config.ffn_dim
-        self.embedding_size = config.word_embed_proj_dim
-        self.vocab_size = config.vocab_size
-        self.max_position = config.max_position_embeddings
-
-    def _get_param_size(self) -> int:
-        word_embedding = self.vocab_size * self.embedding_size // self.tensor_parallel_size
-        if self.embedding_size != self.hidden_size:
-            # Project in/out.
-            word_embedding += 2 * self.embedding_size * self.hidden_size
-        position_embedding = self.max_position * self.hidden_size
-
-        ln1 = 2 * self.hidden_size
-        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        mha = ln1 + q + k + v + out
-
-        ln2 = 2 * self.hidden_size
-        ffn1 = self.hidden_size * self.ffn_size // self.tensor_parallel_size + self.ffn_size
-        ffn2 = self.ffn_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        ffn = ln2 + ffn1 + ffn2
-
-        total = (word_embedding + position_embedding +
-                 self.num_layers * (mha + ffn))
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def _get_max_act_size(
-        self,
-        max_num_batched_tokens: int,
-    ) -> int:
-        # NOTE: We approxmiately calculate the maximum activation size by
-        # estimating
-        # 1) the maximum activation tensor size during inference
-        # 2) the residual tensor size during inference
-        # Here, we assume that FlashAttention is used and
-        # thus the attention maps are never materialized in GPU DRAM.
-        residual = max_num_batched_tokens * self.hidden_size
-        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
-        ffn = max_num_batched_tokens * self.ffn_size // self.tensor_parallel_size
-        # Double the activation size for input and output.
-        max_act = 2 * (max(qkv, ffn) + residual)
-        # Size of output logits.
-        output_logits = 2 * (max_num_batched_tokens * self.vocab_size)
-        max_act = max(max_act, output_logits)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * max_act
-
-    def get_cache_block_size(self) -> int:
-        key_cache_block = self.block_size * self.hidden_size // self.tensor_parallel_size
-        value_cache_block = key_cache_block
-        total = self.num_layers * (key_cache_block + value_cache_block)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def get_max_num_gpu_blocks(
-        self,
-        max_num_batched_tokens: int,
-        memory_utilization: float = 0.95,
-    ) -> int:
-        # NOTE(woosuk): This assumes that the machine has homogeneous GPUs.
-        usable_memory = int(memory_utilization * self.gpu_memory)
-
-        param_size = self._get_param_size()
-        act_size = self._get_max_act_size(max_num_batched_tokens)
-        workspace_size = self.get_workspace_size()
-
-        max_cache_size = usable_memory - (param_size + act_size + workspace_size)
-        if max_cache_size <= 0:
-            raise RuntimeError('Not enough GPU memory.')
-        max_num_blocks = max_cache_size // self.get_cache_block_size()
-        return max_num_blocks
-
-
-class LlamaMemoryAnalyzer(CacheFlowMemoryAnalyzer):
-
-    def __init__(
-        self,
-        model_name: str,
-        block_size: int,
-        dtype: torch.dtype,
-        gpu_memory: int,
-        cpu_memory: int,
-        tensor_parallel_size: int,
-    ) -> None:
-        self.model_name = model_name
-        self.block_size = block_size
-        self.dtype = dtype
-        self.gpu_memory = gpu_memory
-        self.cpu_memory = cpu_memory
-        self.tensor_parallel_size = tensor_parallel_size
-
-        config = AutoConfig.from_pretrained(model_name)
-        self.num_layers = config.num_hidden_layers
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_size = config.hidden_size // self.num_heads
-        self.ffn_size = config.intermediate_size
-        self.vocab_size = config.vocab_size
-        self.max_position = 8192
-
-    def _get_param_size(self) -> int:
-        word_embedding = self.vocab_size * self.hidden_size // self.tensor_parallel_size
-        position_embedding = self.max_position * self.hidden_size
-
-        # NOTE: LLaMA does not have bias terms.
-        ln1 = self.hidden_size
-        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size
-        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size
-        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size
-        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size
-        # Rotary embedding.
-        # TODO(woosuk): Share the rotary embedding between layers.
-        rot = self.max_position * self.head_size
-        mha = ln1 + q + k + v + out + rot
-
-        ln2 = self.hidden_size
-        gate = self.hidden_size * self.ffn_size // self.tensor_parallel_size
-        down = self.ffn_size * self.hidden_size // self.tensor_parallel_size
-        up = self.hidden_size * self.ffn_size // self.tensor_parallel_size
-        ffn = ln2 + gate + down + up
-
-        total = (word_embedding + position_embedding + self.num_layers * (mha + ffn))
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def _get_max_act_size(
-        self,
-        max_num_batched_tokens: int,
-    ) -> int:
-        # NOTE: We approxmiately calculate the maximum activation size by
-        # estimating
-        # 1) the maximum activation tensor size during inference
-        # 2) the residual tensor size during inference
-        # Here, we assume that FlashAttention is used and
-        # thus the attention maps are never materialized in GPU DRAM.
-        residual = max_num_batched_tokens * self.hidden_size
-        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
-        ffn = 2 * (max_num_batched_tokens * self.ffn_size) // self.tensor_parallel_size
-        # Double the activation size for input and output.
-        max_act = 2 * (max(qkv, ffn) + residual)
-        # Size of output logits.
-        output_logits = 2 * (max_num_batched_tokens * self.vocab_size)
-        max_act = max(max_act, output_logits)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * max_act
-
-    def get_cache_block_size(self) -> int:
-        key_cache_block = self.block_size * self.hidden_size // self.tensor_parallel_size
-        value_cache_block = key_cache_block
-        total = self.num_layers * (key_cache_block + value_cache_block)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def get_max_num_gpu_blocks(
-        self,
-        max_num_batched_tokens: int,
-        memory_utilization: float = 0.95,
-    ) -> int:
-        # NOTE(woosuk): This assumes that the machine has homogeneous GPUs.
-        gpu_memory = self.gpu_memory
-        usable_memory = int(memory_utilization * gpu_memory)
-
-        param_size = self._get_param_size()
-        act_size = self._get_max_act_size(max_num_batched_tokens)
-        workspace_size = self.get_workspace_size()
-
-        max_cache_size = usable_memory - (param_size + act_size + workspace_size)
-        if max_cache_size <= 0:
-            raise RuntimeError('Not enough GPU memory.')
-        max_num_blocks = max_cache_size // self.get_cache_block_size()
-        return max_num_blocks
--- a/cacheflow/models/model_utils.py
+++ b/cacheflow/models/model_utils.py
@@ -1,72 +0,0 @@
-from typing import Union
-
-import numpy as np
-import torch
-import torch.nn as nn
-from transformers import AutoConfig
-
-from cacheflow.models.memory_analyzer import CacheFlowMemoryAnalyzer
-from cacheflow.models.memory_analyzer import LlamaMemoryAnalyzer
-from cacheflow.models.memory_analyzer import OPTMemoryAnalyzer
-from cacheflow.models.llama import LlamaForCausalLM
-from cacheflow.models.opt import OPTForCausalLM
-from cacheflow.models.utils import get_torch_dtype
-
-
-_MODELS = {
-    'llama': LlamaForCausalLM,
-    'opt': OPTForCausalLM,
-}
-
-_MEMORY_ANALYZERS = {
-    'llama': LlamaMemoryAnalyzer,
-    'opt': OPTMemoryAnalyzer,
-}
-
-
-def get_model(
-    model_name: str,
-    dtype: Union[torch.dtype, str],
-    path: str,
-    use_dummy_weights: bool,
-) -> nn.Module:
-    torch_dtype = get_torch_dtype(dtype)
-    torch.set_default_dtype(torch_dtype)
-    config = AutoConfig.from_pretrained(model_name)
-    for model_class_name, model_class in _MODELS.items():
-        if model_class_name in model_name:
-            if use_dummy_weights:
-                # Create a model instance.
-                # The weights will be initialized as empty tensors.
-                model = model_class(config)
-                model = model.cuda()
-                # NOTE(woosuk): For precise performance evaluation, we assign
-                # random values to the weights. 
-                model.initialize_dummy_weights()
-            else:
-                # Download model weights if it's not cached.
-                weights_dir = model_class.get_weights(model_name, path=path)
-                # Create a model instance.
-                model = model_class(config)
-                # Load the weights from the cached or downloaded files.
-                model.load_weights(weights_dir)
-                model = model.cuda()
-            return model.eval(), torch_dtype
-    raise ValueError(f'Unsupported model name: {model_name}')
-
-
-def get_memory_analyzer(
-    model_name: str,
-    block_size: int,
-    dtype: Union[torch.dtype, str],
-    gpu_memory: int,
-    cpu_memory: int,
-    tensor_parallel_size: int = 1,
-) -> CacheFlowMemoryAnalyzer:
-    torch_dtype = get_torch_dtype(dtype)
-    for model_class, memory_analyzer in _MEMORY_ANALYZERS.items():
-        if model_class in model_name:
-            return memory_analyzer(
-                model_name, block_size, torch_dtype, gpu_memory, cpu_memory,
-                tensor_parallel_size)
-    raise ValueError(f'Unsupported model name: {model_name}')
--- a/cacheflow/models/opt.py
+++ b/cacheflow/models/opt.py
@@ -1,330 +0,0 @@
-"""1D OPT model compatible with HuggingFace weights."""
-import os
-import glob
-import filelock
-from tqdm import tqdm
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-import torch
-from torch import nn
-from transformers import OPTConfig
-from huggingface_hub import snapshot_download
-
-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import OPTCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
-                                                      ColumnParallelLinear,
-                                                      RowParallelLinear)
-from cacheflow.sequence import SequenceOutputs
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class OPTLearnedPositionalEmbedding(nn.Embedding):
-
-    def __init__(self, num_embeddings: int, embedding_dim: int):
-        # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
-        # and adjust num_embeddings appropriately. Other models don't have this hack
-        self.offset = 2
-        super().__init__(num_embeddings + self.offset, embedding_dim)
-
-    def forward(self, positions: torch.LongTensor):
-        return super().forward(positions + self.offset)
-
-
-class OPTAttention(nn.Module):
-
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        bias: bool = True,
-    ) -> None:
-        super().__init__()
-        self.embed_dim = embed_dim
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
-        total_num_heads = num_heads
-        assert num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = total_num_heads // tensor_model_parallel_world_size
-        self.head_dim = embed_dim // total_num_heads
-        self.scaling = self.head_dim ** -0.5
-
-        self.qkv_proj = ColumnParallelLinear(embed_dim, 3 * embed_dim, bias=bias,
-                                             gather_output=False,
-                                             perform_initialization=False)
-        self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
-                                          input_is_parallel=True,
-                                          perform_initialization=False)
-        self.attn = OPTCacheFlowAttention(scale=self.scaling)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.chunk(chunks=3, dim=-1)
-        key_cache, value_cache = kv_cache
-        attn_output = self.attn(
-            q, k, v, key_cache, value_cache, input_metadata, cache_event)
-        output, _ = self.out_proj(attn_output)
-        return output
-
-
-class OPTDecoderLayer(nn.Module):
-
-    def __init__(self, config: OPTConfig):
-        super().__init__()
-        self.config = config
-        self.embed_dim = config.hidden_size
-        self.self_attn = OPTAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.num_attention_heads,
-            bias=config.enable_bias,
-        )
-        self.do_layer_norm_before = config.do_layer_norm_before
-        assert config.activation_function == 'relu'
-        self.activation_fn = nn.ReLU()
-
-        self.self_attn_layer_norm = nn.LayerNorm(
-            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
-        self.fc1 = ColumnParallelLinear(self.embed_dim, config.ffn_dim,
-                                        bias=config.enable_bias,
-                                        gather_output=False,
-                                        perform_initialization=False)
-        self.fc2 = RowParallelLinear(config.ffn_dim, self.embed_dim,
-                                     bias=config.enable_bias,
-                                     input_is_parallel=True,
-                                     perform_initialization=False)
-        self.final_layer_norm = nn.LayerNorm(
-            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        # Self Attention
-        residual = hidden_states
-        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
-        if self.do_layer_norm_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
-        hidden_states = self.self_attn(
-            hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event)
-        hidden_states = residual + hidden_states
-        # 350m applies layer norm AFTER attention
-        if not self.do_layer_norm_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        # Fully Connected
-        residual = hidden_states
-        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
-        if self.do_layer_norm_before:
-            hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states, _ = self.fc1(hidden_states)
-        hidden_states = self.activation_fn(hidden_states)
-        hidden_states, _ = self.fc2(hidden_states)
-        hidden_states = residual + hidden_states
-        # 350m applies layer norm AFTER attention
-        if not self.do_layer_norm_before:
-            hidden_states = self.final_layer_norm(hidden_states)
-        return hidden_states
-
-
-class OPTDecoder(nn.Module):
-
-    def __init__(self, config: OPTConfig):
-        super().__init__()
-        self.config = config
-        self.padding_idx = config.pad_token_id
-        self.max_target_positions = config.max_position_embeddings
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.word_embed_proj_dim,
-                                                   perform_initialization=False)
-        # Positional embeddings are replicated (not sharded).
-        self.embed_positions = OPTLearnedPositionalEmbedding(
-            config.max_position_embeddings, config.hidden_size)
-
-        # Project out & in will be replicated if they exist.
-        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
-        else:
-            self.project_out = None
-
-        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
-        else:
-            self.project_in = None
-
-        # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
-        # with checkpoints that have been fine-tuned before transformers v4.20.1
-        # see https://github.com/facebookresearch/metaseq/pull/164
-        if config.do_layer_norm_before and not config._remove_final_layer_norm:
-            self.final_layer_norm = nn.LayerNorm(
-                config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
-            )
-        else:
-            self.final_layer_norm = None
-
-        self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        positions: torch.LongTensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        inputs_embeds = self.embed_tokens(input_ids)
-        pos_embeds = self.embed_positions(positions)
-        if self.project_in is not None:
-            inputs_embeds = self.project_in(inputs_embeds)
-        hidden_states = inputs_embeds + pos_embeds
-
-        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
-            layer = self.layers[i]
-            hidden_states = layer(
-                hidden_states, kv_caches[i], input_metadata, cache_event)
-
-        if self.final_layer_norm is not None:
-            hidden_states = self.final_layer_norm(hidden_states)
-        if self.project_out is not None:
-            hidden_states = self.project_out(hidden_states)
-        return hidden_states
-
-
-class OPTModel(nn.Module):
-
-    def __init__(self, config: OPTConfig):
-        super().__init__()
-        self.decoder = OPTDecoder(config)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        positions: torch.LongTensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        return self.decoder(
-            input_ids, positions, kv_caches, input_metadata, cache_events)
-
-
-class OPTForCausalLM(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.model = OPTModel(config)
-        # TODO(zhuohan): create a new weight after implementing pipeline
-        #                parallelism
-        self.lm_head_weight = self.model.decoder.embed_tokens.weight
-        self.sampler = Sampler()
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        positions: torch.LongTensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> Dict[int, SequenceOutputs]:
-        hidden_states = self.model(
-            input_ids, positions, kv_caches, input_metadata, cache_events)
-        next_tokens = self.sampler(
-            self.lm_head_weight, hidden_states, input_metadata)
-        return next_tokens
-
-    _column_parallel_weights = ["embed_tokens.weight", "fc1.weight", "fc1.bias"]
-    _row_parallel_weights = ["out_proj.weight", "fc2.weight"]
-
-    def load_weights(self, weights_path: str):
-        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-        for name, param in state_dict.items():
-            if "lm_head_weight" in name:
-                continue
-            if "qkv_proj" in name:
-                shard_size = param.shape[0] // 3
-                weights_to_concat = []
-                for weight_name in ["q_proj", "k_proj", "v_proj"]:
-                    weight = np.load(os.path.join(
-                        weights_path, name.replace("qkv_proj", weight_name)))
-                    weights_to_concat.append(weight[
-                        shard_size * tensor_model_parallel_rank
-                        :shard_size * (tensor_model_parallel_rank + 1)])
-                loaded_weight = torch.from_numpy(
-                    np.concatenate(weights_to_concat, axis=0))
-            else:
-                loaded_weight = torch.from_numpy(
-                    np.load(os.path.join(weights_path, name)))
-                for p in self._column_parallel_weights:
-                    if p in name:
-                        shard_size = param.shape[0]
-                        loaded_weight = loaded_weight[
-                            shard_size * tensor_model_parallel_rank
-                            :shard_size * (tensor_model_parallel_rank + 1)]
-                        break
-                for p in self._row_parallel_weights:
-                    if p in name:
-                        shard_size = param.shape[1]
-                        loaded_weight = loaded_weight[
-                            :,
-                            shard_size * tensor_model_parallel_rank
-                            :shard_size * (tensor_model_parallel_rank + 1)]
-                        break
-
-            assert param.shape == loaded_weight.shape
-            param.data.copy_(loaded_weight)
-
-    @staticmethod
-    def get_weights(model_name: str, path: str):
-        path = os.path.join(path, f"{model_name}-np")
-        path = os.path.abspath(os.path.expanduser(path))
-        os.makedirs(path, exist_ok=True)
-        lock_path = os.path.join(path, "file_lock")
-        lock = filelock.FileLock(lock_path)
-
-        with lock:
-            test_weight_path = os.path.join(
-                path, "model.decoder.embed_positions.weight")
-            if os.path.exists(test_weight_path):
-                return path
-
-            folder = snapshot_download(model_name, allow_patterns="*.bin",
-                                       cache_dir=os.path.join(path, "cache"))
-            bin_files = glob.glob(os.path.join(folder, "*.bin"))
-
-            for bin_file in tqdm(bin_files, desc="Convert format"):
-                state = torch.load(bin_file, map_location="cpu")
-                for name, param in tqdm(state.items(), leave=False):
-                    if name.startswith("decoder."):
-                        name = "model." + name
-                    param_path = os.path.join(path, name)
-                    with open(param_path, "wb") as f:
-                        np.save(f, param.cpu().detach().numpy())
-
-            return path
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-0.1, 0.1)
--- a/cacheflow/models/sample.py
+++ b/cacheflow/models/sample.py
@@ -1,287 +0,0 @@
-from typing import Dict, List, Tuple
-
-import torch
-import torch.nn as nn
-
-from cacheflow.models import InputMetadata
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import SequenceOutputs
-from cacheflow.parallel_utils.tensor_parallel import gather_from_tensor_model_parallel_region
-
-
-class Sampler(nn.Module):
-
-    def __init__(self) -> None:
-        super().__init__()
-
-    def forward(
-        self,
-        embedding: torch.Tensor,
-        hidden_states: torch.Tensor,
-        input_metadata: InputMetadata,
-    ) -> Dict[int, SequenceOutputs]:
-        # Get the hidden states that we use for sampling.
-        hidden_states = _prune_hidden_states(hidden_states, input_metadata)
-
-        # Get the logits for the next tokens.
-        logits = torch.matmul(hidden_states, embedding.t())
-        logits = gather_from_tensor_model_parallel_region(logits)
-
-        # Apply temperature scaling.
-        temperatures = _get_temperatures(input_metadata)
-        assert len(temperatures) == logits.shape[0]
-        if any(t != 1.0 for t in temperatures):
-            t = torch.tensor(
-                temperatures, dtype=logits.dtype, device=logits.device)
-            # Use in-place division to avoid creating a new tensor.
-            logits.div_(t.unsqueeze(dim=1))
-
-        # We use float32 for probabilities and log probabilities.
-        # Compute the probabilities.
-        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
-        # Compute the log probabilities (before applying top-p).
-        logprobs = torch.log(probs)
-
-        # Apply top-p truncation.
-        top_ps = _get_top_ps(input_metadata)
-        assert len(top_ps) == probs.shape[0]
-        if any(p < 1.0 for p in top_ps):
-            p = torch.tensor(top_ps, dtype=probs.dtype, device=probs.device)
-            probs = _apply_top_p(probs, p)
-
-        # Sample the next tokens.
-        return _sample(probs, logprobs, input_metadata)
-
-
-def _prune_hidden_states(
-    hidden_states: torch.Tensor,
-    input_metadata: InputMetadata,
-) -> torch.Tensor:
-    start_idx = 0
-    last_token_indicies: List[int] = []
-    for prompt_len in input_metadata.prompt_lens:
-        last_token_indicies.append(start_idx + prompt_len - 1)
-        start_idx += prompt_len
-    last_token_indicies.extend(
-        range(start_idx, start_idx + input_metadata.num_generation_tokens))
-    return hidden_states[last_token_indicies]
-
-
-def _get_temperatures(
-    input_metadata: InputMetadata,
-) -> List[float]:
-    # Collect the temperatures for the logits.
-    temperatures: List[float] = []
-    for i, seq_group in enumerate(input_metadata.seq_groups):
-        seq_ids, sampling_params = seq_group
-        temperature = sampling_params.temperature
-        if temperature == 0.0:
-            # NOTE: Zero temperature means deterministic sampling
-            # (i.e., greedy sampling or beam search).
-            # Set the temperature to 1 to avoid division by zero.
-            temperature = 1.0
-
-        if i < input_metadata.num_prompts:
-            # A prompt input.
-            temperatures.append(temperature)
-        else:
-            # A generation token.
-            temperatures += [temperature] * len(seq_ids)
-    return temperatures
-
-
-def _get_top_ps(
-    input_metadata: InputMetadata,
-) -> List[float]:
-    top_ps: List[float] = []
-    for i, seq_group in enumerate(input_metadata.seq_groups):
-        seq_ids, sampling_params = seq_group
-        if i < input_metadata.num_prompts:
-            # A prompt input.
-            top_ps.append(sampling_params.top_p)
-        else:
-            # A generation token.
-            top_ps += [sampling_params.top_p] * len(seq_ids)
-    return top_ps
-
-
-def _apply_top_p(
-    probs: torch.Tensor,
-    p: torch.Tensor,
-) -> torch.Tensor:
-    # TODO(woosuk): Optimize.
-    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
-    mask = (probs_sum - probs_sort) > p.unsqueeze(dim=1)
-    probs_sort[mask] = 0.0
-    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
-    probs = torch.gather(
-        probs_sort, dim=-1, index=torch.argsort(probs_idx, dim=-1))
-    return probs
-
-
-def _get_topk_logprobs(
-    logprobs: torch.Tensor,
-    num_logprobs: int,
-) -> Dict[int, float]:
-    if num_logprobs == 0:
-        return {}
-
-    topk_logprobs, topk_ids = torch.topk(logprobs, num_logprobs)
-    if num_logprobs == 1:
-        topk_logprobs = [topk_logprobs.item()]
-        topk_ids = [topk_ids.item()]
-    else:
-        topk_logprobs = topk_logprobs.tolist()
-        topk_ids = topk_ids.tolist()
-
-    token_to_logprob: Dict[int, float] = {}
-    for token_id, logprob in zip(topk_ids, topk_logprobs):
-        token_to_logprob[token_id] = logprob
-    return token_to_logprob
-
-
-def _sample_from_prompt(
-    prob: torch.Tensor,
-    sampling_params: SamplingParams,
-) -> List[int]:
-    if sampling_params.use_beam_search:
-        # Beam search.
-        beam_width = sampling_params.n
-        _, next_token_ids = torch.topk(prob, beam_width)
-        next_token_ids = next_token_ids.tolist()
-    elif sampling_params.temperature == 0.0:
-        # Greedy sampling.
-        assert sampling_params.n == 1
-        next_token_id = torch.argmax(prob)
-        next_token_ids = [next_token_id.item()]
-    else:
-        # Neucleus sampling.
-        # Sample n tokens for the prompt.
-        n = sampling_params.n
-        next_token_ids = torch.multinomial(
-            prob, num_samples=n, replacement=True)
-        next_token_ids = next_token_ids.tolist()
-    return next_token_ids
-
-
-def _sample_from_generation_tokens(
-    seq_ids: List[int],
-    probs: torch.Tensor,
-    logprobs: torch.Tensor,
-    seq_logprobs: List[float],
-    sampling_params: SamplingParams,
-) -> Tuple[List[int], List[int]]:
-    # NOTE(woosuk): sampling_params.n can be greater than
-    # len(seq_ids) because some sequences in the group might have
-    # been already terminated.
-    if sampling_params.use_beam_search:
-        # Beam search.
-        # Add cumulative logprobs for the sequences in the group.
-        seq_logprobs = torch.tensor(
-            seq_logprobs, dtype=torch.float, device=logprobs.device)
-        logprobs = logprobs + seq_logprobs.unsqueeze(dim=1)
-
-        vocab_size = logprobs.size(-1)
-        beam_width = len(seq_ids)
-        _, topk_ids = torch.topk(logprobs.flatten(), beam_width)
-        topk_ids = topk_ids.tolist()
-        seq_idx = [i // vocab_size for i in topk_ids]
-        beam_seq_ids = [seq_ids[i] for i in seq_idx]
-        token_ids = [i % vocab_size for i in topk_ids]
-
-        beam_outputs: Dict[int, Tuple[int, int]] = {}
-        outstanding_beams: List[Tuple[int, int]] = []
-        # If a beam survives, continue with it.
-        for seq_id, token_id in zip(beam_seq_ids, token_ids):
-            if seq_id not in beam_outputs:
-                beam_outputs[seq_id] = (seq_id, token_id)
-            else:
-                outstanding_beams.append((seq_id, token_id))
-
-        # If a beam is discarded, fork another beam.
-        for seq_id in seq_ids:
-            if seq_id not in beam_outputs:
-                beam_outputs[seq_id] = outstanding_beams.pop()
-        assert not outstanding_beams
-
-        parent_seq_ids = [beam_outputs[seq_id][0] for seq_id in seq_ids]
-        next_token_ids = [beam_outputs[seq_id][1] for seq_id in seq_ids]
-    elif sampling_params.temperature == 0.0:
-        # Greedy sampling.
-        assert len(seq_ids) == 1
-        next_token_id = torch.argmax(probs, dim=-1)
-        next_token_ids = [next_token_id.item()]
-        parent_seq_ids = seq_ids
-    else:
-        # Neucleus sampling.
-        # Sample 1 token for each sequence in the group.
-        next_token_ids = torch.multinomial(
-            probs, num_samples=1, replacement=True)
-        next_token_ids = next_token_ids.squeeze(dim=-1).tolist()
-        parent_seq_ids = seq_ids
-    return parent_seq_ids, next_token_ids
-
-
-def _sample(
-    probs: torch.Tensor,
-    logprobs: torch.Tensor,
-    input_metadata: InputMetadata,
-) -> Dict[int, SequenceOutputs]:
-    seq_outputs: Dict[int, SequenceOutputs] = {}
-
-    # TODO(woosuk): Optimize.
-    idx = 0
-    for i, seq_group in enumerate(input_metadata.seq_groups):
-        seq_ids, sampling_params = seq_group
-        if i < input_metadata.num_prompts:
-            # Generate the next tokens for a prompt input.
-            assert len(seq_ids) == sampling_params.n
-            prob = probs[idx]
-            logprob = logprobs[idx]
-            idx += 1
-
-            # Sample the next tokens.
-            next_token_ids = _sample_from_prompt(prob, sampling_params)
-            # Get top-k log probabilities for the next tokens.
-            next_logprobs = _get_topk_logprobs(
-                logprob, sampling_params.num_logprobs)
-
-            # Build the output.
-            for seq_id, next_token_id in zip(seq_ids, next_token_ids):
-                output_logprobs = next_logprobs.copy()
-                output_logprobs[next_token_id] = logprob[next_token_id].item()
-                seq_outputs[seq_id] = SequenceOutputs(
-                    seq_id, seq_id, next_token_id, output_logprobs)
-        else:
-            # Generate the next tokens for generation tokens.
-            prob = probs[idx:idx + len(seq_ids)]
-            logprob = logprobs[idx:idx + len(seq_ids)]
-            idx += len(seq_ids)
-
-            # Sample the next tokens.
-            seq_logprobs = [
-                input_metadata.seq_logprobs[seq_id] for seq_id in seq_ids]
-            parent_seq_ids, next_token_ids = _sample_from_generation_tokens(
-                seq_ids, prob, logprob, seq_logprobs, sampling_params)
-
-            # Get top-k log probabilities for the next tokens.
-            next_logprobs: Dict[int, Dict[int, float]] = {}
-            for i, seq_id in enumerate(seq_ids):
-                next_logprobs[seq_id] = _get_topk_logprobs(
-                    logprob[i], sampling_params.num_logprobs)
-
-            # Build the output.
-            for seq_id, parent_seq_id, next_token_id in zip(
-                seq_ids, parent_seq_ids, next_token_ids):
-                i = seq_ids.index(parent_seq_id)
-                output_logprobs = next_logprobs[parent_seq_id].copy()
-                output_logprobs[next_token_id] = logprob[i, next_token_id].item()
-                seq_outputs[seq_id] = SequenceOutputs(
-                    seq_id,
-                    parent_seq_id,
-                    next_token_id,
-                    output_logprobs,
-                )
-
-    return seq_outputs
--- a/cacheflow/models/utils.py
+++ b/cacheflow/models/utils.py
@@ -1,24 +0,0 @@
-from typing import Union
-
-import torch
-
-_STR_DTYPE_TO_TORCH_DTYPE = {
-    'half': torch.half,
-    'float': torch.float,
-    'float16': torch.float16,
-    'float32': torch.float32,
-}
-
-
-def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
-    if isinstance(dtype, str):
-        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype.lower()]
-    else:
-        torch_dtype = dtype
-    return torch_dtype
-
-
-def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
-    torch_dtype = get_torch_dtype(dtype)
-    return torch.tensor([], dtype=torch_dtype).element_size()
-
--- a/cacheflow/parallel_utils/README.md
+++ b/cacheflow/parallel_utils/README.md
@@ -1 +0,0 @@
-The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference.
--- a/cacheflow/parallel_utils/init.py
+++ b/cacheflow/parallel_utils/init.py
@@ -1,12 +0,0 @@
-import cacheflow.parallel_utils.parallel_state
-import cacheflow.parallel_utils.tensor_parallel
-import cacheflow.parallel_utils.utils
-
-# Alias parallel_state as mpu, its legacy name
-mpu = parallel_state
-
-__all__ = [
-    "parallel_state",
-    "tensor_parallel",
-    "utils",
-]
--- a/cacheflow/parallel_utils/parallel_state.py
+++ b/cacheflow/parallel_utils/parallel_state.py
@@ -1,593 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Model and data parallel groups."""
-
-import torch
-from typing import Optional
-
-from .utils import GlobalMemoryBuffer
-
-# Intra-layer model parallel group that the current rank belongs to.
-_TENSOR_MODEL_PARALLEL_GROUP = None
-# Inter-layer model parallel group that the current rank belongs to.
-_PIPELINE_MODEL_PARALLEL_GROUP = None
-# Model parallel group (both intra- and pipeline) that the current rank belongs to.
-_MODEL_PARALLEL_GROUP = None
-# Embedding group.
-_EMBEDDING_GROUP = None
-# Position embedding group.
-_POSITION_EMBEDDING_GROUP = None
-# Data parallel group that the current rank belongs to.
-_DATA_PARALLEL_GROUP = None
-
-_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
-_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
-_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None
-
-# These values enable us to change the mpu sizes on the fly.
-_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
-_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
-_MPU_TENSOR_MODEL_PARALLEL_RANK = None
-_MPU_PIPELINE_MODEL_PARALLEL_RANK = None
-
-# A list of ranks that have a copy of the embedding.
-_EMBEDDING_GLOBAL_RANKS = None
-
-# A list of ranks that have a copy of the position embedding.
-_POSITION_EMBEDDING_GLOBAL_RANKS = None
-
-# A list of global ranks for each pipeline group to ease calculation of the source
-# rank when broadcasting from the first or last pipeline stage.
-_PIPELINE_GLOBAL_RANKS = None
-
-# A list of global ranks for each data parallel group to ease calculation of the source
-# rank when broadcasting weights from src to all other data parallel ranks
-_DATA_PARALLEL_GLOBAL_RANKS = None
-
-# Memory buffers to avoid dynamic memory allocation
-_GLOBAL_MEMORY_BUFFER = None
-
-_ALL_REDUCE_LAUNCHER: Optional['GraphAllReduce'] = None
-
-def initialize_model_parallel(
-    tensor_model_parallel_size: int = 1,
-    pipeline_model_parallel_size: int = 1,
-    virtual_pipeline_model_parallel_size: Optional[int] = None,
-    pipeline_model_parallel_split_rank: Optional[int] = None,
-) -> None:
-    """
-    Initialize model data parallel groups.
-
-    Arguments:
-        tensor_model_parallel_size: number of GPUs used for tensor model parallelism.
-        pipeline_model_parallel_size: number of GPUs used for pipeline model parallelism.
-        virtual_pipeline_model_parallel_size: number of virtual stages (interleaved
-                                              pipeline).
-        pipeline_model_parallel_split_rank: for models with both encoder and decoder,
-                                            rank in pipeline with split point.
-
-    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
-    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
-    the model pipeline. The present function will
-    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
-    and 8 data-parallel groups as:
-        8 data_parallel groups:
-            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
-        8 tensor model-parallel groups:
-            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
-        4 pipeline model-parallel groups:
-            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
-    Note that for efficiency, the caller should make sure adjacent ranks
-    are on the same DGX box. For example if we are using 2 DGX-1 boxes
-    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
-    ranks 8 to 15 belong to the second box.
-    """
-    # Get world size and rank. Ensure some consistencies.
-    assert torch.distributed.is_initialized()
-    world_size: int = torch.distributed.get_world_size()
-
-    if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0:
-        raise RuntimeError(
-            f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
-            f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})"
-        )
-
-    data_parallel_size: int = world_size // (tensor_model_parallel_size *
-                                             pipeline_model_parallel_size)
-
-    num_tensor_model_parallel_groups: int  = world_size // tensor_model_parallel_size
-    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
-    num_data_parallel_groups: int = world_size // data_parallel_size
-
-    if virtual_pipeline_model_parallel_size is not None:
-        if not pipeline_model_parallel_size > 2:
-            raise RuntimeError("pipeline-model-parallel size should be greater than 2 with "
-                               "interleaved schedule")
-        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-        _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
-        _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size
-
-    if pipeline_model_parallel_split_rank is not None:
-        global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-        _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = pipeline_model_parallel_split_rank
-
-    rank = torch.distributed.get_rank()
-
-    # Build the data-parallel groups.
-    global _DATA_PARALLEL_GROUP
-    global _DATA_PARALLEL_GLOBAL_RANKS
-    assert _DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized'
-    all_data_parallel_group_ranks = []
-    for i in range(pipeline_model_parallel_size):
-        start_rank = i * num_pipeline_model_parallel_groups
-        end_rank = (i + 1) * num_pipeline_model_parallel_groups
-        for j in range(tensor_model_parallel_size):
-            ranks = range(start_rank + j, end_rank, tensor_model_parallel_size)
-            all_data_parallel_group_ranks.append(list(ranks))
-            group = torch.distributed.new_group(ranks)
-            if rank in ranks:
-                _DATA_PARALLEL_GROUP = group
-                _DATA_PARALLEL_GLOBAL_RANKS = ranks
-
-    # Build the model-parallel groups.
-    global _MODEL_PARALLEL_GROUP
-    assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
-    for i in range(data_parallel_size):
-        ranks = [data_parallel_group_ranks[i]
-                 for data_parallel_group_ranks in all_data_parallel_group_ranks]
-        group = torch.distributed.new_group(ranks)
-        if rank in ranks:
-            _MODEL_PARALLEL_GROUP = group
-
-    # Build the tensor model-parallel groups.
-    global _TENSOR_MODEL_PARALLEL_GROUP
-    assert _TENSOR_MODEL_PARALLEL_GROUP is None, \
-        'tensor model parallel group is already initialized'
-    for i in range(num_tensor_model_parallel_groups):
-        ranks = range(i * tensor_model_parallel_size,
-                      (i + 1) * tensor_model_parallel_size)
-        group = torch.distributed.new_group(ranks)
-        if rank in ranks:
-            _TENSOR_MODEL_PARALLEL_GROUP = group
-
-    # Build the pipeline model-parallel groups and embedding groups
-    # (first and last rank in each pipeline model-parallel group).
-    global _PIPELINE_MODEL_PARALLEL_GROUP
-    global _PIPELINE_GLOBAL_RANKS
-    assert _PIPELINE_MODEL_PARALLEL_GROUP is None, \
-        'pipeline model parallel group is already initialized'
-    global _EMBEDDING_GROUP
-    global _EMBEDDING_GLOBAL_RANKS
-    assert _EMBEDDING_GROUP is None, 'embedding group is already initialized'
-    global _POSITION_EMBEDDING_GROUP
-    global _POSITION_EMBEDDING_GLOBAL_RANKS
-    assert _POSITION_EMBEDDING_GROUP is None, \
-        'position embedding group is already initialized'
-    for i in range(num_pipeline_model_parallel_groups):
-        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
-        group = torch.distributed.new_group(ranks)
-        if rank in ranks:
-            _PIPELINE_MODEL_PARALLEL_GROUP = group
-            _PIPELINE_GLOBAL_RANKS = ranks
-        # Setup embedding group (to exchange gradients between
-        # first and last stages).
-        if len(ranks) > 1:
-            embedding_ranks = [ranks[0], ranks[-1]]
-            position_embedding_ranks = [ranks[0]]
-            if pipeline_model_parallel_split_rank is not None:
-                if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks:
-                    embedding_ranks = [ranks[0],
-                                       ranks[pipeline_model_parallel_split_rank],
-                                       ranks[-1]]
-                if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks:
-                    position_embedding_ranks = [ranks[0],
-                                       ranks[pipeline_model_parallel_split_rank]]
-        else:
-            embedding_ranks = ranks
-            position_embedding_ranks = ranks
-
-        group = torch.distributed.new_group(embedding_ranks)
-        if rank in embedding_ranks:
-            _EMBEDDING_GROUP = group
-        if rank in ranks:
-            _EMBEDDING_GLOBAL_RANKS = embedding_ranks
-
-        group = torch.distributed.new_group(position_embedding_ranks)
-        if rank in position_embedding_ranks:
-            _POSITION_EMBEDDING_GROUP = group
-        if rank in ranks:
-            _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks
-
-    # Initialize global memory buffer
-    # This isn't really "parallel state" but there isn't another good place to
-    # put this. If we end up with a more generic initialization of megatron-core
-    # we could stick it there
-    _set_global_memory_buffer()
-
-
-def initialize_all_reduce_launcher(
-    max_num_tokens: int,
-    hidden_size: int,
-    dtype: torch.dtype,
-    disable_graph: bool = False,
-) -> None:
-    global _ALL_REDUCE_LAUNCHER
-    _ALL_REDUCE_LAUNCHER = GraphAllReduce(
-        max_num_tokens=max_num_tokens,
-        hidden_size=hidden_size,
-        dtype=dtype,
-        disable_graph=disable_graph,
-    )
-
-def model_parallel_is_initialized():
-    """Check if model and data parallel groups are initialized."""
-    if _TENSOR_MODEL_PARALLEL_GROUP is None or \
-        _PIPELINE_MODEL_PARALLEL_GROUP is None or \
-        _DATA_PARALLEL_GROUP is None:
-        return False
-    return True
-
-
-def get_model_parallel_group():
-    """Get the model parallel group the caller rank belongs to."""
-    assert _MODEL_PARALLEL_GROUP is not None, \
-        'model parallel group is not initialized'
-    return _MODEL_PARALLEL_GROUP
-
-
-def get_tensor_model_parallel_group():
-    """Get the tensor model parallel group the caller rank belongs to."""
-    assert _TENSOR_MODEL_PARALLEL_GROUP is not None, \
-        'intra_layer_model parallel group is not initialized'
-    return _TENSOR_MODEL_PARALLEL_GROUP
-
-
-def get_pipeline_model_parallel_group():
-    """Get the pipeline model parallel group the caller rank belongs to."""
-    assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, \
-        'pipeline_model parallel group is not initialized'
-    return _PIPELINE_MODEL_PARALLEL_GROUP
-
-
-def get_data_parallel_group():
-    """Get the data parallel group the caller rank belongs to."""
-    assert _DATA_PARALLEL_GROUP is not None, \
-        'data parallel group is not initialized'
-    return _DATA_PARALLEL_GROUP
-
-
-def get_embedding_group():
-    """Get the embedding group the caller rank belongs to."""
-    assert _EMBEDDING_GROUP is not None, \
-        'embedding group is not initialized'
-    return _EMBEDDING_GROUP
-
-
-def get_position_embedding_group():
-    """Get the position embedding group the caller rank belongs to."""
-    assert _POSITION_EMBEDDING_GROUP is not None, \
-        'position embedding group is not initialized'
-    return _POSITION_EMBEDDING_GROUP
-
-
-def set_tensor_model_parallel_world_size(world_size):
-    """Set the tensor model parallel size"""
-    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size
-
-
-def set_pipeline_model_parallel_world_size(world_size):
-    """Set the pipeline model parallel size"""
-    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
-
-
-def get_tensor_model_parallel_world_size():
-    """Return world size for the tensor model parallel group."""
-    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-    if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
-        return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-    return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
-
-
-def get_pipeline_model_parallel_world_size():
-    """Return world size for the pipeline model parallel group."""
-    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
-        return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())
-
-
-def set_tensor_model_parallel_rank(rank):
-    """Set tensor model parallel rank."""
-    global _MPU_TENSOR_MODEL_PARALLEL_RANK
-    _MPU_TENSOR_MODEL_PARALLEL_RANK = rank
-
-
-def set_pipeline_model_parallel_rank(rank):
-    """Set pipeline model parallel rank."""
-    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
-    _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank
-
-
-def set_pipeline_model_parallel_split_rank(rank):
-    """Set pipeline model parallel split rank."""
-    global _MPU_PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-    _MPU_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = rank
-
-
-def get_tensor_model_parallel_rank():
-    """Return my rank for the tensor model parallel group."""
-    global _MPU_TENSOR_MODEL_PARALLEL_RANK
-    if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None:
-        return _MPU_TENSOR_MODEL_PARALLEL_RANK
-    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
-
-
-def get_pipeline_model_parallel_rank():
-    """Return my rank for the pipeline model parallel group."""
-    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
-    if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
-        return _MPU_PIPELINE_MODEL_PARALLEL_RANK
-    return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())
-
-
-
-def is_pipeline_first_stage(ignore_virtual=False):
-    """Return True if in the first pipeline model-parallel stage, False otherwise."""
-    if not ignore_virtual:
-        if get_virtual_pipeline_model_parallel_world_size() is not None and \
-            get_virtual_pipeline_model_parallel_rank() != 0:
-            return False
-    return get_pipeline_model_parallel_rank() == 0
-
-
-def is_pipeline_last_stage(ignore_virtual=False):
-    """Return True if in the last pipeline model-parallel stage, False otherwise."""
-    if not ignore_virtual:
-        virtual_pipeline_model_parallel_world_size = \
-            get_virtual_pipeline_model_parallel_world_size()
-        if virtual_pipeline_model_parallel_world_size is not None and \
-            get_virtual_pipeline_model_parallel_rank() != (
-                virtual_pipeline_model_parallel_world_size - 1):
-            return False
-    return get_pipeline_model_parallel_rank() == (
-        get_pipeline_model_parallel_world_size() - 1)
-
-
-def is_rank_in_embedding_group(ignore_virtual=False):
-    """Return true if current rank is in embedding group, False otherwise."""
-    rank = torch.distributed.get_rank()
-    global _EMBEDDING_GLOBAL_RANKS
-    if ignore_virtual:
-        return rank in _EMBEDDING_GLOBAL_RANKS
-    if rank in _EMBEDDING_GLOBAL_RANKS:
-        if rank == _EMBEDDING_GLOBAL_RANKS[0]:
-            return is_pipeline_first_stage(ignore_virtual=False)
-        elif rank == _EMBEDDING_GLOBAL_RANKS[-1]:
-            return is_pipeline_last_stage(ignore_virtual=False)
-        else:
-            return True
-    return False
-
-
-def is_rank_in_position_embedding_group():
-    """Return true if current rank is in position embedding group, False otherwise."""
-    rank = torch.distributed.get_rank()
-    global _POSITION_EMBEDDING_GLOBAL_RANKS
-    return rank in _POSITION_EMBEDDING_GLOBAL_RANKS
-
-
-def is_pipeline_stage_before_split(rank=None):
-    """Return True if pipeline stage executes encoder block for a model
-    with both encoder and decoder."""
-    if get_pipeline_model_parallel_world_size() == 1:
-        return True
-    if rank is None:
-        rank = get_pipeline_model_parallel_rank()
-    global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-    if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None:
-        return True
-    if rank < _PIPELINE_MODEL_PARALLEL_SPLIT_RANK:
-        return True
-    return False
-
-
-def is_pipeline_stage_after_split(rank=None):
-    """Return True if pipeline stage executes decoder block for a model
-    with both encoder and decoder."""
-    if get_pipeline_model_parallel_world_size() == 1:
-        return True
-    if rank is None:
-        rank = get_pipeline_model_parallel_rank()
-    global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-    if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None:
-        return True
-    if rank >= _PIPELINE_MODEL_PARALLEL_SPLIT_RANK:
-        return True
-    return False
-
-
-def is_pipeline_stage_at_split():
-    """Return true if pipeline stage executes decoder block and next
-    stage executes encoder block for a model with both encoder and
-    decoder."""
-    rank = get_pipeline_model_parallel_rank()
-    return is_pipeline_stage_before_split(rank) and \
-            is_pipeline_stage_after_split(rank+1)
-
-
-def get_virtual_pipeline_model_parallel_rank():
-    """Return the virtual pipeline-parallel rank."""
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-
-
-def set_virtual_pipeline_model_parallel_rank(rank):
-    """Set the virtual pipeline-parallel rank."""
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank
-
-
-def get_virtual_pipeline_model_parallel_world_size():
-    """Return the virtual pipeline-parallel world size."""
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-
-
-def get_tensor_model_parallel_src_rank():
-    """Calculate the global rank corresponding to the first local rank
-    in the tensor model parallel group."""
-    global_rank = torch.distributed.get_rank()
-    local_world_size = get_tensor_model_parallel_world_size()
-    return (global_rank // local_world_size) * local_world_size
-
-
-def get_data_parallel_src_rank():
-    """Calculate the global rank corresponding to the first local rank
-    in the data parallel group."""
-    assert _DATA_PARALLEL_GLOBAL_RANKS is not None, \
-        "Data parallel group is not initialized"
-    return _DATA_PARALLEL_GLOBAL_RANKS[0]
-
-
-def get_pipeline_model_parallel_first_rank():
-    """Return the global rank of the first process in the pipeline for the
-    current tensor parallel group"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, \
-        "Pipeline parallel group is not initialized"
-    return _PIPELINE_GLOBAL_RANKS[0]
-
-
-def get_pipeline_model_parallel_last_rank():
-    """Return the global rank of the last process in the pipeline for the
-    current tensor parallel group"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, \
-        "Pipeline parallel group is not initialized"
-    last_rank_local = get_pipeline_model_parallel_world_size() - 1
-    return _PIPELINE_GLOBAL_RANKS[last_rank_local]
-
-def get_pipeline_model_parallel_next_rank():
-    """Return the global rank that follows the caller in the pipeline"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, \
-        "Pipeline parallel group is not initialized"
-    rank_in_pipeline = get_pipeline_model_parallel_rank()
-    world_size = get_pipeline_model_parallel_world_size()
-    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
-
-
-def get_pipeline_model_parallel_prev_rank():
-    """Return the global rank that preceeds the caller in the pipeline"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, \
-        "Pipeline parallel group is not initialized"
-    rank_in_pipeline = get_pipeline_model_parallel_rank()
-    world_size = get_pipeline_model_parallel_world_size()
-    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
-
-
-def get_data_parallel_world_size():
-    """Return world size for the data parallel group."""
-    return torch.distributed.get_world_size(group=get_data_parallel_group())
-
-
-def get_data_parallel_rank():
-    """Return my rank for the data parallel group."""
-    return torch.distributed.get_rank(group=get_data_parallel_group())
-
-def _set_global_memory_buffer():
-    """Initialize global buffer"""
-    global _GLOBAL_MEMORY_BUFFER
-    assert _GLOBAL_MEMORY_BUFFER is None, 'global memory buffer is already initialized'
-    _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()
-
-def get_global_memory_buffer():
-    """Return the global GlobalMemoryBuffer object"""
-    assert _GLOBAL_MEMORY_BUFFER is not None, 'global memory buffer is not initialized'
-    return _GLOBAL_MEMORY_BUFFER
-
-def get_all_reduce_launcher() -> 'GraphAllReduce':
-    assert _ALL_REDUCE_LAUNCHER is not None, 'all reduce launcher is not initialized'
-    return _ALL_REDUCE_LAUNCHER
-
-def destroy_model_parallel():
-    """Set the groups to none."""
-    global _MODEL_PARALLEL_GROUP
-    _MODEL_PARALLEL_GROUP = None
-    global _TENSOR_MODEL_PARALLEL_GROUP
-    _TENSOR_MODEL_PARALLEL_GROUP = None
-    global _PIPELINE_MODEL_PARALLEL_GROUP
-    _PIPELINE_MODEL_PARALLEL_GROUP = None
-    global _DATA_PARALLEL_GROUP
-    _DATA_PARALLEL_GROUP = None
-    global _EMBEDDING_GROUP
-    _EMBEDDING_GROUP = None
-    global _POSITION_EMBEDDING_GROUP
-    _POSITION_EMBEDDING_GROUP = None
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
-    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
-    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
-    global _MPU_TENSOR_MODEL_PARALLEL_RANK
-    _MPU_TENSOR_MODEL_PARALLEL_RANK = None
-    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
-    _MPU_PIPELINE_MODEL_PARALLEL_RANK = None
-    global _GLOBAL_MEMORY_BUFFER
-    _GLOBAL_MEMORY_BUFFER = None
-
-
-class GraphAllReduce:
-
-    def __init__(
-        self,
-        max_num_tokens: int,
-        hidden_size: int,
-        dtype: torch.dtype,
-        disable_graph: bool = False,
-    ) -> None:
-        self.max_num_tokens = max_num_tokens
-        self.hidden_size = hidden_size
-        self.disable_graph = disable_graph
-
-        tp_world_size = get_tensor_model_parallel_world_size()
-        if tp_world_size == 1:
-            return
-
-        self.group = get_tensor_model_parallel_group()
-        self.buffer = torch.empty(
-            size=(max_num_tokens, hidden_size),
-            dtype=dtype,
-            device='cuda',
-        )
-
-        # Build graphs for different number of tokens.
-        if not self.disable_graph:
-            self.graphs = {}
-            for num_tokens in range(8, max_num_tokens + 1, 8):
-                self.graphs[num_tokens] = self._build_graph(num_tokens)
-
-    def _build_graph(self, num_tokens: int) -> torch.cuda.CUDAGraph:
-        # Warm up.
-        torch.distributed.all_reduce(self.buffer[:num_tokens], group=self.group)
-        torch.cuda.synchronize()
-
-        # Build graph.
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph):
-            torch.distributed.all_reduce(
-                self.buffer[:num_tokens], group=self.group)
-        torch.cuda.synchronize()
-        return graph
-
-    def launch(self, x: torch.Tensor) -> torch.Tensor:
-        # NOTE: x must be a slice of self.buffer.
-        num_tokens = x.shape[0]
-        if self.disable_graph:
-            torch.distributed.all_reduce(x, group=self.group)
-        else:
-            self.graphs[num_tokens].replay()
-        return x
--- a/cacheflow/parallel_utils/tensor_parallel/init.py
+++ b/cacheflow/parallel_utils/tensor_parallel/init.py
@@ -1,55 +0,0 @@
-from .layers import (
-    ColumnParallelLinear,
-    RowParallelLinear,
-    VocabParallelEmbedding,
-    set_tensor_model_parallel_attributes,
-    set_defaults_if_not_set_tensor_model_parallel_attributes,
-    copy_tensor_model_parallel_attributes,
-    param_is_not_tensor_parallel_duplicate,
-)
-
-from .mappings import (
-    copy_to_tensor_model_parallel_region,
-    gather_from_tensor_model_parallel_region,
-    gather_from_sequence_parallel_region,
-    scatter_to_tensor_model_parallel_region,
-    scatter_to_sequence_parallel_region,
-)
-
-from .random import (
-    checkpoint,
-    get_cuda_rng_tracker,
-    model_parallel_cuda_manual_seed,
-)
-
-from .utils import (
-    split_tensor_along_last_dim,
-    split_tensor_into_1d_equal_chunks,
-    gather_split_1d_tensor,
-)
-
-__all__ = [
-    #layers.py
-    "ColumnParallelLinear",
-    "RowParallelLinear",
-    "VocabParallelEmbedding",
-    "set_tensor_model_parallel_attributes",
-    "set_defaults_if_not_set_tensor_model_parallel_attributes",
-    "copy_tensor_model_parallel_attributes",
-    "param_is_not_tensor_parallel_duplicate",
-    # mappings.py
-    "copy_to_tensor_model_parallel_region",
-    "gather_from_tensor_model_parallel_region",
-    "gather_from_sequence_parallel_region",
-#    "reduce_from_tensor_model_parallel_region",
-    "scatter_to_tensor_model_parallel_region",
-    "scatter_to_sequence_parallel_region",
-    # random.py
-    "checkpoint",
-    "get_cuda_rng_tracker",
-    "model_parallel_cuda_manual_seed",
-    # utils.py
-    "split_tensor_along_last_dim",
-    "split_tensor_into_1d_equal_chunks",
-    "gather_split_1d_tensor",
-]
--- a/cacheflow/parallel_utils/tensor_parallel/layers.py
+++ b/cacheflow/parallel_utils/tensor_parallel/layers.py
@@ -1,446 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-# Parts of the code here are adapted from PyTorch
-# repo: https://github.com/pytorch/pytorch
-
-
-import torch
-import torch.nn.functional as F
-import torch.nn.init as init
-from torch.nn.parameter import Parameter
-
-from cacheflow.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-    get_all_reduce_launcher,
-)
-from .mappings import (
-    copy_to_tensor_model_parallel_region,
-    gather_from_tensor_model_parallel_region,
-    reduce_from_tensor_model_parallel_region,
-    scatter_to_tensor_model_parallel_region,
-)
-
-from .random import get_cuda_rng_tracker
-from .utils import (
-    divide,
-    VocabUtility,
-)
-
-_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
-                                      'partition_dim': -1,
-                                      'partition_stride': 1}
-
-def param_is_not_tensor_parallel_duplicate(param):
-    return (hasattr(param, 'tensor_model_parallel') and
-            param.tensor_model_parallel) or (
-                get_tensor_model_parallel_rank() == 0)
-
-
-def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
-    # Make sure the attributes are not set.
-    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
-        assert not hasattr(tensor, attribute)
-    # Set the attributes.
-    setattr(tensor, 'tensor_model_parallel', is_parallel)
-    setattr(tensor, 'partition_dim', dim)
-    setattr(tensor, 'partition_stride', stride)
-
-
-def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
-    def maybe_set(attribute, value):
-        if not hasattr(tensor, attribute):
-            setattr(tensor, attribute, value)
-    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
-        maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])
-
-
-def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
-    def maybe_copy(attribute):
-        if hasattr(source_tensor, attribute):
-            setattr(destination_tensor, attribute,
-                    getattr(source_tensor, attribute))
-    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
-        maybe_copy(attribute)
-
-
-def _initialize_affine_weight_gpu(weight, init_method,
-                                  partition_dim, stride=1):
-    """Initialize affine weight for model parallel on GPU."""
-
-    set_tensor_model_parallel_attributes(tensor=weight,
-                                         is_parallel=True,
-                                         dim=partition_dim,
-                                         stride=stride)
-
-    with get_cuda_rng_tracker().fork():
-        init_method(weight)
-
-
-def _initialize_affine_weight_cpu(weight, output_size, input_size,
-                                  per_partition_size, partition_dim,
-                                  init_method, stride=1,
-                                  return_master_weight=False,
-                                  *, params_dtype=None):
-    """Initialize affine weight for model parallel.
-
-    Build the master weight on all processes and scatter
-    the relevant chunk."""
-
-    set_tensor_model_parallel_attributes(tensor=weight,
-                                         is_parallel=True,
-                                         dim=partition_dim,
-                                         stride=stride)
-
-    if params_dtype is None:
-        params_dtype = torch.get_default_dtype()
-
-    # Initialize master weight
-    master_weight = torch.empty(output_size, input_size,
-                                dtype=torch.float,
-                                requires_grad=False)
-    init_method(master_weight)
-    master_weight = master_weight.to(dtype=params_dtype)
-
-    # Split and copy
-    per_partition_per_stride_size = divide(per_partition_size, stride)
-    weight_list = torch.split(master_weight, per_partition_per_stride_size,
-                              dim=partition_dim)
-    rank = get_tensor_model_parallel_rank()
-    world_size = get_tensor_model_parallel_world_size()
-    my_weight_list = weight_list[rank::world_size]
-
-    with torch.no_grad():
-        torch.cat(my_weight_list, dim=partition_dim, out=weight)
-    if return_master_weight:
-        return master_weight
-    return None
-
-
-class VocabParallelEmbedding(torch.nn.Module):
-    """Embedding parallelized in the vocabulary dimension.
-
-    This is mainly adapted from torch.nn.Embedding and all the default
-    values are kept.
-    Arguments:
-        num_embeddings: vocabulary size.
-        embedding_dim: size of hidden state.
-
-    Keyword Arguments:
-        init_method: method to initialize weights.
-        params_dtype
-        use_cpu_initialization
-        perform_initialization
-    """
-
-    def __init__(self, num_embeddings: int, embedding_dim: int, *,
-                 init_method=init.xavier_normal_,
-                 params_dtype: torch.dtype=None,
-                 use_cpu_initialization: bool=False,
-                 perform_initialization: bool=True):
-        super(VocabParallelEmbedding, self).__init__()
-        # Keep the input dimensions.
-        self.num_embeddings = num_embeddings
-        self.embedding_dim = embedding_dim
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        # Set the defaults for compatibility.
-        self.padding_idx = None
-        self.max_norm = None
-        self.norm_type = 2.
-        self.scale_grad_by_freq = False
-        self.sparse = False
-        self._weight = None
-        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
-        # Divide the weight matrix along the vocaburaly dimension.
-        self.vocab_start_index, self.vocab_end_index = \
-            VocabUtility.vocab_range_from_global_vocab_size(
-                self.num_embeddings, get_tensor_model_parallel_rank(),
-                self.tensor_model_parallel_size)
-        self.num_embeddings_per_partition = self.vocab_end_index - \
-            self.vocab_start_index
-
-        # Allocate weights and initialize.
-        if use_cpu_initialization:
-            self.weight = Parameter(torch.empty(
-                self.num_embeddings_per_partition, self.embedding_dim,
-                dtype=params_dtype))
-            if perform_initialization:
-                _initialize_affine_weight_cpu(
-                    self.weight, self.num_embeddings, self.embedding_dim,
-                    self.num_embeddings_per_partition, 0, init_method,
-                    params_dtype=params_dtype)
-        else:
-            self.weight = Parameter(torch.empty(
-                self.num_embeddings_per_partition, self.embedding_dim,
-                device=torch.cuda.current_device(), dtype=params_dtype))
-            if perform_initialization:
-                _initialize_affine_weight_gpu(self.weight, init_method,
-                                              partition_dim=0, stride=1)
-
-    def forward(self, input_):
-        if self.tensor_model_parallel_size > 1:
-            # Build the mask.
-            input_mask = (input_ < self.vocab_start_index) | \
-                         (input_ >= self.vocab_end_index)
-            # Mask the input.
-            masked_input = input_.clone() - self.vocab_start_index
-            masked_input[input_mask] = 0
-        else:
-            masked_input = input_
-            # Get the embeddings.
-        output_parallel = F.embedding(masked_input, self.weight,
-                                      self.padding_idx, self.max_norm,
-                                      self.norm_type, self.scale_grad_by_freq,
-                                      self.sparse)
-        # Mask the output embedding.
-        if self.tensor_model_parallel_size > 1:
-            output_parallel[input_mask, :] = 0.0
-        # Reduce across all the model parallel GPUs.
-        output = reduce_from_tensor_model_parallel_region(output_parallel)
-        return output
-
-
-class ColumnParallelLinear(torch.nn.Module):
-    """Linear layer with column parallelism.
-
-    The linear layer is defined as Y = XA + b. A is parallelized along
-    its second dimension as A = [A_1, ..., A_p].
-
-    Arguments:
-        input_size: first dimension of matrix A.
-        output_size: second dimension of matrix A.
-
-    Keyword Arguments
-        bias: If true, add bias
-        gather_output: If true, call all-gather on output and make Y available
-                       to all GPUs, otherwise, every GPU will have its output
-                       which is Y_i = XA_i
-        init_method: method to initialize weights. Note that bias is always set
-                     to zero.
-        stride: For the strided linear layers.
-        keep_master_weight_for_test: This was added for testing and should be
-                                     set to False. It returns the master weights
-                                     used for initialization.
-        skip_bias_add: This was added to enable performance optimations where bias
-                       can be fused with other elementwise operations. we skip
-                       adding bias but instead return it.
-        params_dtype:
-        use_cpu_initialization:
-    """
-
-    def __init__(self, input_size, output_size, *,
-                 bias=True, gather_output=True,
-                 init_method=init.xavier_normal_, stride=1,
-                 keep_master_weight_for_test=False,
-                 skip_bias_add=False,
-                 params_dtype=None,
-                 use_cpu_initialization=False,
-                 perform_initialization=True,
-                 ):
-        super(ColumnParallelLinear, self).__init__()
-
-        # Keep input parameters
-        self.input_size = input_size
-        self.output_size = output_size
-        self.gather_output = gather_output
-        # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
-        self.output_size_per_partition = divide(output_size, world_size)
-        self.skip_bias_add = skip_bias_add
-
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        # Parameters.
-        # Note: torch.nn.functional.linear performs XA^T + b and as a result
-        # we allocate the transpose.
-        # Initialize weight.
-        if use_cpu_initialization:
-            self.weight = Parameter(torch.empty(self.output_size_per_partition,
-                                                self.input_size,
-                                                dtype=params_dtype))
-            if perform_initialization:
-                self.master_weight = _initialize_affine_weight_cpu(
-                    self.weight, self.output_size, self.input_size,
-                    self.output_size_per_partition, 0, init_method,
-                    stride=stride, return_master_weight=keep_master_weight_for_test)
-        else:
-            self.weight = Parameter(torch.empty(
-                self.output_size_per_partition, self.input_size,
-                device=torch.cuda.current_device(), dtype=params_dtype))
-            if perform_initialization:
-                _initialize_affine_weight_gpu(self.weight, init_method,
-                                              partition_dim=0, stride=stride)
-
-        if bias:
-            if use_cpu_initialization:
-                self.bias = Parameter(torch.empty(
-                    self.output_size_per_partition, dtype=params_dtype))
-            else:
-                self.bias = Parameter(torch.empty(
-                    self.output_size_per_partition,
-                    device=torch.cuda.current_device(),
-                    dtype=params_dtype))
-            set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
-            # Always initialize bias to zero.
-            with torch.no_grad():
-                self.bias.zero_()
-        else:
-            self.register_parameter('bias', None)
-
-
-    def forward(self, input_):
-        """Forward of ColumnParallelLinear
-
-        Args:
-            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
-
-        Returns:
-            - output
-            - bias
-        """
-        bias = self.bias if not self.skip_bias_add else None
-
-        input_parallel = copy_to_tensor_model_parallel_region(input_)
-        # Matrix multiply.
-        output_parallel = F.linear(input_parallel, self.weight, bias)
-        if self.gather_output:
-            # All-gather across the partitions.
-            output = gather_from_tensor_model_parallel_region(output_parallel)
-        else:
-            output = output_parallel
-        output_bias = self.bias if self.skip_bias_add else None
-        return output, output_bias
-
-
-class RowParallelLinear(torch.nn.Module):
-    """Linear layer with row parallelism.
-
-    The linear layer is defined as Y = XA + b. A is parallelized along
-    its first dimension and X along its second dimension as:
-               -   -
-              | A_1 |
-              | .   |
-          A = | .   |        X = [X_1, ..., X_p]
-              | .   |
-              | A_p |
-               -   -
-    Arguments:
-        input_size: first dimension of matrix A.
-        output_size: second dimension of matrix A.
-
-    Keyword Arguments:
-        bias: If true, add bias. Note that bias is not parallelized.
-        input_is_parallel: If true, we assume that the input is already
-                           split across the GPUs and we do not split
-                           again.
-        init_method: method to initialize weights. Note that bias is always set
-                     to zero.
-        stride: For the strided linear layers.
-        keep_master_weight_for_test: This was added for testing and should be
-                                     set to False. It returns the master weights
-                                     used for initialization.
-        skip_bias_add: This was added to enable performance optimization where bias
-                       can be fused with other elementwise operations. We skip
-                       adding bias but instead return it.
-        params_dtype:
-        use_cpu_initialization:
-        perform_initialization:
-    """
-
-    def __init__(self, input_size, output_size, *,
-                 bias=True, input_is_parallel=False,
-                 init_method=init.xavier_normal_, stride=1,
-                 keep_master_weight_for_test=False,
-                 skip_bias_add=False,
-                 params_dtype=None,
-                 use_cpu_initialization=False,
-                 perform_initialization=True,
-                 ):
-        super(RowParallelLinear, self).__init__()
-
-        # Keep input parameters
-        self.input_size = input_size
-        self.output_size = output_size
-        self.input_is_parallel = input_is_parallel
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
-        self.input_size_per_partition = divide(input_size, world_size)
-        self.skip_bias_add = skip_bias_add
-
-        # Parameters.
-        # Note: torch.nn.functional.linear performs XA^T + b and as a result
-        # we allocate the transpose.
-        # Initialize weight.
-        if use_cpu_initialization:
-            self.weight = Parameter(torch.empty(self.output_size,
-                                                self.input_size_per_partition,
-                                                dtype=params_dtype))
-            if perform_initialization:
-                self.master_weight = _initialize_affine_weight_cpu(
-                    self.weight, self.output_size, self.input_size,
-                    self.input_size_per_partition, 1, init_method,
-                    stride=stride, return_master_weight=keep_master_weight_for_test,
-                    params_dtype=params_dtype)
-        else:
-            self.weight = Parameter(torch.empty(
-                self.output_size, self.input_size_per_partition,
-                device=torch.cuda.current_device(), dtype=params_dtype))
-            if perform_initialization:
-                _initialize_affine_weight_gpu(self.weight, init_method,
-                                              partition_dim=1, stride=stride)
-        if bias:
-            if use_cpu_initialization:
-                self.bias = Parameter(torch.empty(self.output_size,
-                                                  dtype=params_dtype))
-            else:
-                self.bias = Parameter(torch.empty(
-                    self.output_size, device=torch.cuda.current_device(),
-                    dtype=params_dtype))
-
-            # Always initialize bias to zero.
-            with torch.no_grad():
-                self.bias.zero_()
-        else:
-            self.register_parameter('bias', None)
-        self.weight_t = self.weight.t()
-
-    def forward(self, input_):
-        """Forward of RowParallelLinear
-
-        Args:
-            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
-
-        Returns:
-            - output
-            - bias
-        """
-        # Set up backprop all-reduce.
-        if self.input_is_parallel:
-            input_parallel = input_
-        else:
-            input_parallel = scatter_to_tensor_model_parallel_region(input_)
-        if get_tensor_model_parallel_world_size() == 1:
-            # Matrix multiply.
-            output_ = F.linear(input_parallel, self.weight)
-        else:
-            # Matrix multiply.
-            all_reduce_launcher = get_all_reduce_launcher()
-            num_tokens = input_parallel.shape[0]
-            output_buffer = all_reduce_launcher.buffer[:num_tokens]
-            torch.matmul(input_parallel, self.weight_t, out=output_buffer)
-            # All-reduce across all the partitions.
-            output_ = all_reduce_launcher.launch(output_buffer)
-
-        if not self.skip_bias_add:
-            output = output_ + self.bias if self.bias is not None else output_
-            output_bias = None
-        else:
-            output = output_
-            output_bias = self.bias
-        return output, output_bias
--- a/cacheflow/parallel_utils/tensor_parallel/mappings.py
+++ b/cacheflow/parallel_utils/tensor_parallel/mappings.py
@@ -1,279 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-from cacheflow.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-    get_tensor_model_parallel_group,
-)
-from .utils import split_tensor_along_last_dim
-
-
-def _reduce(input_):
-    """All-reduce the input tensor across model parallel group."""
-
-    # Bypass the function if we are using only 1 GPU.
-    if get_tensor_model_parallel_world_size()==1:
-        return input_
-
-    # All-reduce.
-    torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
-
-    return input_
-
-
-def _split_along_last_dim(input_):
-    """Split the tensor along its last dimension and keep the
-    corresponding slice."""
-
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    # Split along last dimension.
-    input_list = split_tensor_along_last_dim(input_, world_size)
-
-    # Note: torch.split does not create contiguous tensors by default.
-    rank = get_tensor_model_parallel_rank()
-    output = input_list[rank].contiguous()
-
-    return output
-
-
-def _split_along_first_dim(input_):
-    """Split the tensor along its first dimension and keep the
-    corresponding slice."""
-
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    # Split along first dimension.
-    dim_size = input_.size()[0]
-    assert dim_size % world_size == 0, \
-        "First dimension of the tensor should be divisible by tensor parallel size"
-    local_dim_size = dim_size // world_size
-    rank = get_tensor_model_parallel_rank()
-    dim_offset = rank * local_dim_size
-
-    output = input_[dim_offset:dim_offset+local_dim_size].contiguous()
-
-    return output
-
-
-def _gather_along_last_dim(input_):
-    """Gather tensors and concatinate along the last dimension."""
-
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    # Size and dimension.
-    last_dim = input_.dim() - 1
-    rank = get_tensor_model_parallel_rank()
-
-    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-    tensor_list[rank] = input_
-    torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())
-
-    # Note: torch.cat already creates a contiguous tensor.
-    output = torch.cat(tensor_list, dim=last_dim).contiguous()
-
-    return output
-
-
-def _gather_along_first_dim(input_):
-    """Gather tensors and concatinate along the first dimension."""
-
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    dim_size = list(input_.size())
-    dim_size[0] = dim_size[0] * world_size
-
-    output = torch.empty(dim_size, dtype=input_.dtype,
-                         device=torch.cuda.current_device())
-    torch.distributed._all_gather_base(output, input_.contiguous(),
-                                       group=get_tensor_model_parallel_group())
-
-    return output
-
-def _reduce_scatter_along_first_dim(input_):
-    """Reduce-scatter the input tensor across model parallel group."""
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    dim_size = list(input_.size())
-    assert dim_size[0] % world_size == 0, \
-        "First dimension of the tensor should be divisible by tensor parallel size"
-
-    dim_size[0] = dim_size[0] // world_size
-
-    output = torch.empty(dim_size, dtype=input_.dtype,
-                         device=torch.cuda.current_device())
-    torch.distributed._reduce_scatter_base(output, input_.contiguous(),
-                                           group=get_tensor_model_parallel_group())
-    return output
-
-
-class _CopyToModelParallelRegion(torch.autograd.Function):
-    """Pass the input to the model parallel region."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return input_
-
-    @staticmethod
-    def forward(ctx, input_):
-        return input_
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _reduce(grad_output)
-
-
-class _ReduceFromModelParallelRegion(torch.autograd.Function):
-    """All-reduce the input from the model parallel region."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _reduce(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _reduce(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return grad_output
-
-
-class _ScatterToModelParallelRegion(torch.autograd.Function):
-    """Split the input and keep only the corresponding chuck to the rank."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _split_along_last_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _split_along_last_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _gather_along_last_dim(grad_output)
-
-
-class _GatherFromModelParallelRegion(torch.autograd.Function):
-    """Gather the input from model parallel region and concatinate."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _gather_along_last_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _gather_along_last_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _split_along_last_dim(grad_output)
-
-
-class _ScatterToSequenceParallelRegion(torch.autograd.Function):
-    """Split the input and keep only the corresponding chuck to the rank."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _split_along_first_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _split_along_first_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _gather_along_first_dim(grad_output)
-
-
-class _GatherFromSequenceParallelRegion(torch.autograd.Function):
-    """Gather the input from sequence parallel region and concatinate."""
-
-    @staticmethod
-    def symbolic(graph, input_, tensor_parallel_output_grad=True):
-        return _gather_along_first_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_, tensor_parallel_output_grad=True):
-        ctx.tensor_parallel_output_grad = tensor_parallel_output_grad
-        return _gather_along_first_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        tensor_parallel_output_grad = ctx.tensor_parallel_output_grad
-
-        # If the computation graph after the gather operation is
-        # in the tensor parallel mode, output gradients need to reduce
-        # scattered and whereas if the computation is duplicated,
-        # output gradients need to be scattered.
-        if tensor_parallel_output_grad:
-            return _reduce_scatter_along_first_dim(grad_output), None
-        else:
-            return _split_along_first_dim(grad_output), None
-
-
-class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
-    """Reduce scatter the input from the model parallel region."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _reduce_scatter_along_first_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _reduce_scatter_along_first_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _gather_along_first_dim(grad_output)
-
-
-# -----------------
-# Helper functions.
-# -----------------
-
-def copy_to_tensor_model_parallel_region(input_):
-    return _CopyToModelParallelRegion.apply(input_)
-
-
-def reduce_from_tensor_model_parallel_region(input_):
-    return _ReduceFromModelParallelRegion.apply(input_)
-
-
-def scatter_to_tensor_model_parallel_region(input_):
-    return _ScatterToModelParallelRegion.apply(input_)
-
-
-def gather_from_tensor_model_parallel_region(input_):
-    return _GatherFromModelParallelRegion.apply(input_)
-
-
-def scatter_to_sequence_parallel_region(input_):
-    return _ScatterToSequenceParallelRegion.apply(input_)
-
-
-def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
-    return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)
-
-
-def reduce_scatter_to_sequence_parallel_region(input_):
-    return _ReduceScatterToSequenceParallelRegion.apply(input_)
-
--- a/cacheflow/parallel_utils/tensor_parallel/random.py
+++ b/cacheflow/parallel_utils/tensor_parallel/random.py
@@ -1,253 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-# Parts of the code here are adapted from PyTorch
-# repo: https://github.com/pytorch/pytorch
-
-import contextlib
-
-import torch
-from torch import _C
-from torch.cuda import _lazy_call, device as device_ctx_manager
-from torch.utils.checkpoint import detach_variable
-
-from cacheflow.parallel_utils.parallel_state import (
-    get_data_parallel_rank,
-    get_tensor_model_parallel_group,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-
-from .utils import (
-    split_tensor_into_1d_equal_chunks,
-    gather_split_1d_tensor,
-)
-
-from cacheflow.parallel_utils.utils import safely_set_viewless_tensor_data
-
-# Default name for the model parallel rng tracker.
-_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
-
-
-def _set_cuda_rng_state(new_state, device=-1):
-    """Sets the random number generator state of the current GPU.
-
-    Argumentss:
-        new_state (torch.ByteTensor): The desired state
-    This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
-    with a single change: the input state is not cloned. Cloning caused
-    major performance issues for +4 GPU cases.
-    """
-    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
-        # older PyTorch
-        def cb():
-            with device_ctx_manager(device):
-                _C._cuda_setRNGState(new_state)
-    else:
-        # newer PyTorch
-        if device == -1:
-            device = torch.device('cuda')
-        elif isinstance(device, str):
-            device = torch.device(device)
-        elif isinstance(device, int):
-            device = torch.device('cuda', device)
-
-        def cb():
-            idx = device.index
-            if idx is None:
-                idx = torch.cuda.current_device()
-            default_generator = torch.cuda.default_generators[idx]
-            default_generator.set_state(new_state)
-
-    _lazy_call(cb)
-
-
-
-class CudaRNGStatesTracker:
-    """Tracker for the cuda RNG states.
-
-    Using the `add` method, a cuda rng state is initialized based on
-    the input `seed` and is assigned to `name`. Later, by forking the
-    rng state, we can perform operations and return to our starting
-    cuda state.
-    """
-
-    def __init__(self):
-        # Map from a string name to the cuda rng state.
-        self.states_ = {}
-        # Seeds are just for book keeping and ensure no seed is set twice.
-        self.seeds_ = set()
-
-    def reset(self):
-        """Set to the initial state (no tracker)."""
-        self.states_ = {}
-        self.seeds_ = set()
-
-    def get_states(self):
-        """Get rng states. Copy the dictionary so we have direct
-        pointers to the states, not just a pointer to the dictionary."""
-        states = {}
-        for name in self.states_:
-            states[name] = self.states_[name]
-        return states
-
-    def set_states(self, states):
-        """Set the rng states. For efficiency purposes, we do not check
-        the size of seed for compatibility."""
-        self.states_ = states
-
-    def add(self, name, seed):
-        """Track the rng state."""
-        # Check seed is not already used.
-        if seed in self.seeds_:
-            raise Exception('seed {} already exists'.format(seed))
-        self.seeds_.add(seed)
-        # Check that state is not already defined.
-        if name in self.states_:
-            raise Exception('cuda rng state {} already exists'.format(name))
-        # Get the current rng state.
-        orig_rng_state = torch.cuda.get_rng_state()
-        # Set the new state and store it.
-        torch.cuda.manual_seed(seed)
-        self.states_[name] = torch.cuda.get_rng_state()
-        # Reset rng state to what it was.
-        _set_cuda_rng_state(orig_rng_state)
-
-    @contextlib.contextmanager
-    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
-        """Fork the cuda rng state, perform operations, and exit with
-        the original state."""
-        # Check if we have added the state
-        if name not in self.states_:
-            raise Exception('cuda rng state {} is not added'.format(name))
-        # Store current rng state.
-        orig_cuda_rng_state = torch.cuda.get_rng_state()
-        # Set rng state to the desired one
-        _set_cuda_rng_state(self.states_[name])
-        # Do the stuff we wanted to do.
-        try:
-            yield
-        finally:
-            # Update the current rng state for later use.
-            self.states_[name] = torch.cuda.get_rng_state()
-            # And set the state to the original state we started with.
-            _set_cuda_rng_state(orig_cuda_rng_state)
-
-
-# RNG tracker object.
-_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
-
-
-def get_cuda_rng_tracker():
-    """Get cuda rng tracker."""
-    return _CUDA_RNG_STATE_TRACKER
-
-
-def model_parallel_cuda_manual_seed(seed):
-    """Initialize model parallel cuda seed.
-
-    This function should be called after the model parallel is
-    initialized. Also, no torch.cuda.manual_seed should be called
-    after this function. Basically, this is replacement for that
-    function.
-    Two set of RNG states are tracked:
-        default state: This is for data parallelism and is the same among a
-                       set of model parallel GPUs but different across
-                       different model paralle groups. This is used for
-                       example for dropout in the non-tensor-model-parallel regions.
-        tensor-model-parallel state: This state is different among a set of model
-                              parallel GPUs, but the same across data parallel
-                              groups. This is used for example for dropout in
-                              model parallel regions.
-    """
-    # 2718 is just for fun and any POSITIVE value will work.
-    offset = seed + 2718
-    tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
-    # Data parallel gets the original seed.
-    data_parallel_seed = seed
-
-    _CUDA_RNG_STATE_TRACKER.reset()
-    # Set the default state.
-    torch.cuda.manual_seed(data_parallel_seed)
-    # and model parallel state.
-    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
-                                tensor_model_parallel_seed)
-
-
-class CheckpointFunction(torch.autograd.Function):
-    """This function is adapted from torch.utils.checkpoint with
-       two main changes:
-           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
-           2) the states in the model parallel tracker are also properly
-              tracked/set/reset.
-    """
-    @staticmethod
-    def forward(ctx, run_function, distribute_saved_activations, *args):
-        ctx.run_function = run_function
-        ctx.distribute_saved_activations \
-            = distribute_saved_activations
-
-        # Copy the rng states.
-        ctx.fwd_cpu_rng_state = torch.get_rng_state()
-        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
-        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
-
-        with torch.no_grad():
-            outputs = run_function(*args)
-
-        # Divide hidden states across model parallel group and only keep
-        # the chunk corresponding to the current rank.
-        if distribute_saved_activations:
-            ctx.input_0_shape = args[0].data.shape
-            safely_set_viewless_tensor_data(
-                args[0],
-                split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True))
-
-        # Store everything.
-        ctx.save_for_backward(*args)
-
-        return outputs
-
-    @staticmethod
-    def backward(ctx, *args):
-        if not torch.autograd._is_checkpoint_valid():
-            raise RuntimeError("Checkpointing is not compatible with .grad(), "
-                               "please use .backward() if possible")
-        inputs = ctx.saved_tensors
-        if ctx.distribute_saved_activations:
-            safely_set_viewless_tensor_data(
-                inputs[0],
-                gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
-
-        # Store the current states.
-        bwd_cpu_rng_state = torch.get_rng_state()
-        bwd_cuda_rng_state = torch.cuda.get_rng_state()
-        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
-
-        # Set the states to what it used to be before the forward pass.
-        torch.set_rng_state(ctx.fwd_cpu_rng_state)
-        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
-        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
-
-        # Compute the forward pass.
-        detached_inputs = detach_variable(inputs)
-        with torch.enable_grad():
-            outputs = ctx.run_function(*detached_inputs)
-
-        # Set the states back to what it was at the start of this function.
-        torch.set_rng_state(bwd_cpu_rng_state)
-        _set_cuda_rng_state(bwd_cuda_rng_state)
-        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
-
-        if isinstance(outputs, torch.Tensor):
-            outputs = (outputs,)
-        torch.autograd.backward(outputs, args)
-        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
-                      for inp in detached_inputs)
-        return (None, None) + grads
-
-
-def checkpoint(function, distribute_saved_activations, *args):
-    """Checkpoint a model or part of the model.
-    This has been directly copied from torch.utils.checkpoint."""
-    return CheckpointFunction.apply(function,
-                                    distribute_saved_activations, *args)
--- a/cacheflow/parallel_utils/tensor_parallel/utils.py
+++ b/cacheflow/parallel_utils/tensor_parallel/utils.py
@@ -1,108 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-from typing import List, Sequence
-
-from cacheflow.parallel_utils.utils import divide
-from cacheflow.parallel_utils import parallel_state
-
-def split_tensor_along_last_dim(
-    tensor: torch.Tensor,
-    num_partitions: int,
-    contiguous_split_chunks: bool = False,
-) -> List[torch.Tensor]:
-    """ Split a tensor along its last dimension.
-
-        Arguments:
-            tensor: input tensor.
-            num_partitions: number of partitions to split the tensor
-            contiguous_split_chunks: If True, make each chunk contiguous
-                                     in memory.
-
-        Returns:
-            A list of Tensors
-    """
-    # Get the size and dimension.
-    last_dim = tensor.dim() - 1
-    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
-    # Split.
-    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
-    # Note: torch.split does not create contiguous tensors by default.
-    if contiguous_split_chunks:
-        return tuple(chunk.contiguous() for chunk in tensor_list)
-
-    return tensor_list
-
-def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
-    """ Break a tensor into equal 1D chunks across tensor parallel ranks.
-
-        Returns a Tensor or View with this rank's portion of the data.
-
-        Arguments:
-            tensor: The tensor to split
-
-        Keyword Arguments:
-            new_buffer (bool): If True, returns a new Tensor.
-                               If False, returns a view into the existing Tensor.
-                               Default is False
-
-    """
-    partition_size = torch.numel(tensor) // \
-        parallel_state.get_tensor_model_parallel_world_size()
-    start_index = partition_size * parallel_state.get_tensor_model_parallel_rank()
-    end_index = start_index + partition_size
-    if new_buffer:
-        data = torch.empty(partition_size, dtype=tensor.dtype,
-                           device=torch.cuda.current_device(),
-                           requires_grad=False)
-        data.copy_(tensor.view(-1)[start_index:end_index])
-    else:
-        data = tensor.view(-1)[start_index:end_index]
-    return data
-
-
-def gather_split_1d_tensor(tensor):
-    """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor
-        model parallel ranks.
-
-        Returns a new Tensor with the gathered data.
-
-        Arguments:
-            tensor: A Tensor or view of this rank's portion of the data.
-    """
-    numel_gathered = torch.numel(tensor) * \
-        parallel_state.get_tensor_model_parallel_world_size()
-    gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
-                           device=torch.cuda.current_device(),
-                           requires_grad=False)
-    # TODO: This API is experimental in pytorch (as of Feb 2022) and
-    # this might break in future pytorch releases. We chose this API
-    # as opposed to torch.distributed.all_gather for efficiency reasons.
-    # This API calls directly NCCL all-gather versus the former does
-    # internal copies and can potentially cause slow down.
-    torch.distributed._all_gather_base(gathered, tensor,
-                                       group=parallel_state.get_tensor_model_parallel_group())
-    return gathered
-
-
-class VocabUtility:
-    """ Split the vocabulary into `world_size` chunks and return the first
-        and last index of the vocabulary belonging to the `rank`
-        partition: Note that indices in [fist, last)
-
-    """
-
-    @staticmethod
-    def vocab_range_from_per_partition_vocab_size(
-        per_partition_vocab_size: int, rank, world_size: int
-    ) -> Sequence[int]:
-        index_f = rank * per_partition_vocab_size
-        index_l = index_f + per_partition_vocab_size
-        return index_f, index_l
-
-    @staticmethod
-    def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]:
-        per_partition_vocab_size = divide(global_vocab_size, world_size)
-        return VocabUtility.vocab_range_from_per_partition_vocab_size(
-            per_partition_vocab_size, rank, world_size
-        )
--- a/cacheflow/parallel_utils/utils.py
+++ b/cacheflow/parallel_utils/utils.py
@@ -1,120 +0,0 @@
-"""Utility functions used throughout Megatron core"""
-from functools import reduce
-import operator
-
-import torch
-
-from cacheflow.parallel_utils import parallel_state
-
-
-def ensure_divisibility(numerator, denominator):
-    """Ensure that numerator is divisible by the denominator."""
-    assert numerator % denominator == 0, "{} is not divisible by {}".format(
-        numerator, denominator
-    )
-
-
-def divide(numerator, denominator):
-    """Ensure that numerator is divisible by the denominator and return
-    the division value."""
-    ensure_divisibility(numerator, denominator)
-    return numerator // denominator
-
-
-class GlobalMemoryBuffer:
-    """Global buffer to avoid dynamic memory allocations.
-    Caller should ensure that buffers of the same name
-    are not used concurrently."""
-
-    def __init__(self):
-        self.buffer = {}
-
-    def get_tensor(self, tensor_shape, dtype, name):
-        required_len = reduce(operator.mul, tensor_shape, 1)
-        if self.buffer.get((name, dtype), None) is None or \
-                self.buffer[(name, dtype)].numel() < required_len:
-            self.buffer[(name, dtype)] = \
-                torch.empty(required_len,
-                            dtype=dtype,
-                            device=torch.cuda.current_device(),
-                            requires_grad=False)
-
-        return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape)
-
-def _kernel_make_viewless_tensor(inp, requires_grad):
-    '''Make a viewless tensor.
-
-    View tensors have the undesirable side-affect of retaining a reference
-    to the originally-viewed tensor, even after manually setting the '.data'
-    field. This method creates a new tensor that links to the old tensor's
-    data, without linking the viewed tensor, referenced via the '._base'
-    field.
-    '''
-    out = torch.empty(
-        (1,),
-        dtype = inp.dtype,
-        device = inp.device,
-        requires_grad = requires_grad,
-    )
-    out.data = inp.data
-    return out
-
-class MakeViewlessTensor(torch.autograd.Function):
-    '''
-    Autograd function to make a viewless tensor.
-
-    This function should be used in cases where the computation graph needs
-    to be propagated, but we only want a viewless tensor (e.g.,
-    ParallelTransformer's hidden_states). Call this function by passing
-    'keep_graph = True' to 'make_viewless_tensor()'.
-    '''
-    @staticmethod
-    def forward(ctx, inp, requires_grad):
-        return _kernel_make_viewless_tensor(inp, requires_grad)
-    @staticmethod
-    def backward(ctx, grad_output):
-        return grad_output, None
-
-def make_viewless_tensor(inp, requires_grad, keep_graph):
-    '''
-    Entry-point for creating viewless tensors.
-
-    This method should be used, rather than calling 'MakeViewlessTensor'
-    or '_kernel_make_viewless_tensor' directly. This method acts as a
-    switch for determining if an autograd function or a regular method
-    should be used to create the tensor.
-    '''
-
-    # return tensor as-is, if not a 'view'
-    if inp._base is None:
-        return inp
-
-    # create viewless tensor
-    if keep_graph:
-        return MakeViewlessTensor.apply(inp, requires_grad)
-    else:
-        return _kernel_make_viewless_tensor(inp, requires_grad)
-
-def assert_viewless_tensor(tensor, extra_msg = None):
-    '''Assert that a tensor is not a view (i.e., its '._base' field is
-    not set).'''
-    if isinstance(tensor, list):
-        [ assert_viewless_tensor(t) for t in tensor ]
-        return tensor
-    if not isinstance(tensor, torch.Tensor):
-        return tensor
-    assert tensor._base is None, (
-        "Ensure tensor._base is None before setting tensor.data or storing "
-        "tensor to memory buffer. Otherwise, a memory leak will occur (and "
-        "likely accumulate over iterations). %s"
-    ) % extra_msg
-    return tensor
-
-def safely_set_viewless_tensor_data(tensor, new_data_tensor):
-    '''Safely set tensor's '.data' field.
-
-    Check first that the tensor is viewless (i.e., '._base' not set). If not,
-    raise an exception.
-    '''
-    assert_viewless_tensor(tensor, extra_msg = "FYI, tensor._base has shape %s, and new_data_tensor has shape %s." % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape))
-    tensor.data = new_data_tensor
--- a/cacheflow/sampling_params.py
+++ b/cacheflow/sampling_params.py
@@ -1,84 +0,0 @@
-from typing import Optional, Set, Dict
-
-
-class SamplingParams:
-
-    def __init__(
-        self,
-        n: int,
-        temperature: float,
-        top_p: float,
-        use_beam_search: bool,
-        stop_token_ids: Set[int],
-        max_num_steps: int,
-        num_logprobs: int,
-        context_window_size: Optional[int],
-    ) -> None:
-        if n < 1:
-            raise ValueError(f'n must be at least 1, got {n}.')
-        if temperature < 0.0:
-            raise ValueError(
-                f'temperature must be non-negative, got {temperature}.')
-        if not 0.0 < top_p <= 1.0:
-            raise ValueError(f'top_p must be in (0, 1], got {top_p}.')
-        if max_num_steps < 1:
-            raise ValueError(
-                f'max_num_steps must be at least 1, got {max_num_steps}.')
-        if num_logprobs < 0:
-            raise ValueError(
-                f'num_logprobs must be non-negative, got {num_logprobs}.')
-        if context_window_size is not None and context_window_size < 0:
-            raise ValueError(
-                'context_window_size must be non-negative, '
-                f'got {context_window_size}.')
-
-        if use_beam_search:
-            if n == 1:
-                raise ValueError(
-                    'n must be greater than 1 when using beam search.')
-            if temperature > 0.0:
-                raise ValueError(
-                    'temperature must be 0 when using beam search.')
-            if top_p < 1.0:
-                raise ValueError(
-                    'top_p must be 1 when using beam search.')
-        elif temperature == 0.0:
-            # Zero temperature means greedy sampling.
-            if n > 1:
-                raise ValueError(
-                    'n must be 1 when using greedy sampling.')
-            if top_p < 1.0:
-                raise ValueError(
-                    'top_p must be 1 when using greedy sampling.')
-
-        self.n = n
-        self.temperature = temperature
-        self.top_p = top_p
-        self.use_beam_search = use_beam_search
-        self.stop_token_ids = stop_token_ids
-        self.max_num_steps = max_num_steps
-        self.num_logprobs = num_logprobs
-        self.context_window_size = context_window_size
-
-    def __repr__(self) -> str:
-        return (f'SamplingParams(n={self.n}, '
-                f'temperature={self.temperature}, '
-                f'top_p={self.top_p}, '
-                f'use_beam_search={self.use_beam_search}, '
-                f'stop_token_ids={self.stop_token_ids}, '
-                f'max_num_steps={self.max_num_steps}, '
-                f'num_logprobs={self.num_logprobs}, '
-                f'context_window_size={self.context_window_size})')
-
-    @classmethod
-    def from_dict(cls, d: Dict) -> 'SamplingParams':
-        return cls(
-            n=d.get('n', 1),
-            temperature=d.get('temperature', 1.0),
-            top_p=d.get('top_p', 1.0),
-            use_beam_search=d.get('use_beam_search', False),
-            stop_token_ids=set(d.get('stop_token_ids', set())),
-            max_num_steps=d.get('max_num_steps', 16),
-            num_logprobs=d.get('num_logprobs', 0),
-            context_window_size=d.get('context_window_size', None),
-        )
--- a/cacheflow/sequence.py
+++ b/cacheflow/sequence.py
@@ -1,169 +0,0 @@
-import copy
-import enum
-from typing import Dict, List, Optional
-
-from cacheflow.block import LogicalTokenBlock
-from cacheflow.sampling_params import SamplingParams
-
-
-class SequenceStatus(enum.Enum):
-    WAITING = enum.auto()
-    RUNNING = enum.auto()
-    SWAPPED = enum.auto()
-    FINISHED = enum.auto()
-
-
-class Sequence:
-
-    def __init__(
-        self,
-        seq_id: int,
-        token_ids: List[int],
-        block_size: int,
-    ) -> None:
-        self.seq_id = seq_id
-        self.block_size = block_size
-
-        self.logical_token_blocks: List[LogicalTokenBlock] = []
-        # Initialize the logical token blocks with the given token ids.
-        self.add(token_ids)
-
-        self.prompt_len = len(token_ids)
-        self.status = SequenceStatus.WAITING
-        self.output_logprobs: List[Dict[int, float]] = []
-        self.cumulative_logprobs = 0.0
-
-    def add_block(self) -> None:
-        block = LogicalTokenBlock(
-            block_number=len(self.logical_token_blocks),
-            block_size=self.block_size,
-        )
-        self.logical_token_blocks.append(block)
-
-    def add(self, token_ids: List[int]) -> None:
-        while token_ids:
-            if not self.logical_token_blocks:
-                self.add_block()
-
-            last_block = self.logical_token_blocks[-1]
-            if last_block.is_full():
-                self.add_block()
-                last_block = self.logical_token_blocks[-1]
-
-            num_empty_slots = last_block.get_num_empty_slots()
-            last_block.append(token_ids[:num_empty_slots])
-            token_ids = token_ids[num_empty_slots:]
-
-    def append(self, token_id: int, logprobs: Dict[int, float]) -> None:
-        assert token_id in logprobs
-        self.add([token_id])
-        self.output_logprobs.append(logprobs)
-        self.cumulative_logprobs += logprobs[token_id]
-
-    def get_len(self) -> int:
-        return sum(block.num_tokens for block in self.logical_token_blocks)
-
-    def get_token_ids(self) -> List[int]:
-        token_ids: List[int] = []
-        for block in self.logical_token_blocks:
-            token_ids.extend(block.get_token_ids())
-        return token_ids
-
-    def get_last_token_id(self) -> int:
-        return self.logical_token_blocks[-1].get_last_token_id()
-
-    def fork(self, child_seq: 'Sequence') -> 'Sequence':
-        child_seq.logical_token_blocks = copy.deepcopy(self.logical_token_blocks)
-        child_seq.output_logprobs = copy.deepcopy(self.output_logprobs)
-        child_seq.cumulative_logprobs = self.cumulative_logprobs
-
-    def __repr__(self) -> str:
-        return (f'Sequence(seq_id={self.seq_id}, '
-                f'status={self.status.name}, '
-                f'num_blocks={len(self.logical_token_blocks)})')
-
-
-class SequenceGroup:
-
-    def __init__(
-        self,
-        group_id: int,
-        seqs: List[Sequence],
-        arrival_time: float,
-    ) -> None:
-        self.group_id = group_id
-        self.seqs = seqs
-        self.arrival_time = arrival_time
-
-    def get_seqs(
-        self,
-        status: Optional[SequenceStatus] = None,
-    ) -> List[Sequence]:
-        if status is None:
-            return self.seqs
-        else:
-            return [seq for seq in self.seqs if seq.status == status]
-
-    def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
-        return len(self.get_seqs(status))
-
-    def find(self, seq_id: int) -> Sequence:
-        for seq in self.seqs:
-            if seq.seq_id == seq_id:
-                return seq
-        raise ValueError(f'Sequence {seq_id} not found.')
-
-    def is_finished(self) -> bool:
-        return all(seq.status == SequenceStatus.FINISHED for seq in self.seqs)
-
-    def __repr__(self) -> str:
-        return (f'SequenceGroup(group_id={self.group_id}, '
-                f'num_seqs={len(self.seqs)})')
-
-
-class SequenceGroupInputs:
-
-    def __init__(
-        self,
-        group_id: int,
-        is_prompt: bool,
-        input_tokens: Dict[int, List[int]],     # Seq id -> token ids.
-        context_len: int,
-        seq_logprobs: Dict[int, float],         # Seq id -> cumulative logprobs.
-        sampling_params: SamplingParams,
-        block_tables: Dict[int, List[int]],     # Seq id -> List of physical block numbers.
-    ) -> None:
-        self.group_id = group_id
-        self.is_prompt = is_prompt
-        self.input_tokens = input_tokens
-        self.context_len = context_len
-        self.seq_logprobs = seq_logprobs
-        self.sampling_params = sampling_params
-        self.block_tables = block_tables
-
-
-class SequenceOutputs:
-
-    def __init__(
-        self,
-        seq_id: int,
-        parent_seq_id: int,
-        output_token: int,
-        logprobs: Dict[int, float],         # Token id -> logP(x_i+1 | x_0, ..., x_i).
-    ) -> None:
-        self.seq_id = seq_id
-        self.parent_seq_id = parent_seq_id
-        self.output_token = output_token
-        self.logprobs = logprobs
-
-    def __repr__(self) -> str:
-        return (f'SequenceOutputs(seq_id={self.seq_id}, '
-                f'parent_seq_id={self.parent_seq_id}, '
-                f'output_token={self.output_token}), '
-                f'logprobs={self.logprobs}')
-
-    def __eq__(self, other: 'SequenceOutputs') -> bool:
-        return (self.seq_id == other.seq_id and
-                self.parent_seq_id == other.parent_seq_id and
-                self.output_token == other.output_token and
-                self.logprobs == other.logprobs)
--- a/cacheflow/utils.py
+++ b/cacheflow/utils.py
@@ -1,47 +0,0 @@
-import enum
-import random
-import psutil
-
-import numpy as np
-import torch
-
-from cacheflow.parallel_utils.parallel_state import model_parallel_is_initialized
-from cacheflow.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
-
-
-class Device(enum.Enum):
-    GPU = enum.auto()
-    CPU = enum.auto()
-
-
-class Counter:
-
-    def __init__(self, start: int = 0) -> None:
-        self.counter = start
-
-    def __next__(self) -> int:
-        id = self.counter
-        self.counter += 1
-        return id
-
-    def reset(self) -> None:
-        self.counter = 0
-
-
-def set_random_seed(seed: int):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-
-    if model_parallel_is_initialized():
-        model_parallel_cuda_manual_seed(seed)
-
-
-def get_gpu_memory(gpu: int = 0) -> int:
-    return torch.cuda.get_device_properties(gpu).total_memory
-
-
-def get_cpu_memory() -> int:
-    return psutil.virtual_memory().total
--- a/cacheflow/worker/cache_engine.py
+++ b/cacheflow/worker/cache_engine.py
@@ -1,127 +0,0 @@
-from typing import Dict, List, Tuple
-
-import torch
-from cacheflow import cache_ops
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class CacheEngine:
-
-    def __init__(
-        self,
-        worker_id: int,
-        num_layers: int,
-        num_heads: int,
-        head_size: int,
-        block_size: int,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-        dtype: torch.dtype,
-    ) -> None:
-        if head_size % 16 != 0:
-            raise ValueError(
-                f'head_size ({head_size}) must be a multiple of 16.')
-
-        self.worker_id = worker_id
-        self.num_layers = num_layers
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.block_size = block_size
-        self.num_gpu_blocks = num_gpu_blocks
-        self.num_cpu_blocks = num_cpu_blocks
-        self.dtype = dtype
-
-        # Initialize the cache.
-        self.gpu_cache = self.allocate_gpu_cache()
-        self.cpu_cache = self.allocate_cpu_cache()
-
-        # Initialize the stream for caching operations.
-        self.cache_stream = torch.cuda.Stream()
-        assert self.cache_stream != torch.cuda.current_stream()
-        # Initialize the events for stream synchronization.
-        self.events = [torch.cuda.Event() for _ in range(num_layers)]
-
-    def get_key_block_shape(self) -> Tuple[int, int, int, int]:
-        element_size = torch.tensor([], dtype=self.dtype).element_size()
-        x = 16 // element_size
-        return (
-            self.num_heads,
-            self.head_size // x,
-            self.block_size,
-            x,
-        )
-
-    def get_value_block_shape(self) -> Tuple[int, int, int]:
-        return (
-            self.num_heads,
-            self.head_size,
-            self.block_size,
-        )
-
-    def allocate_gpu_cache(self) -> List[KVCache]:
-        gpu_cache: List[KVCache] = []
-        key_block_shape = self.get_key_block_shape()
-        value_block_shape = self.get_value_block_shape()
-        for _ in range(self.num_layers):
-            key_blocks = torch.empty(
-                size=(self.num_gpu_blocks, *key_block_shape),
-                dtype=self.dtype,
-                device="cuda",
-            )
-            value_blocks = torch.empty(
-                size=(self.num_gpu_blocks, *value_block_shape),
-                dtype=self.dtype,
-                device="cuda",
-            )
-            gpu_cache.append((key_blocks, value_blocks))
-        return gpu_cache
-
-    def allocate_cpu_cache(self) -> List[KVCache]:
-        cpu_cache: List[KVCache] = []
-        key_block_shape = self.get_key_block_shape()
-        value_block_shape = self.get_value_block_shape()
-        for _ in range(self.num_layers):
-            key_blocks = torch.empty(
-                size=(self.num_cpu_blocks, *key_block_shape),
-                dtype=self.dtype,
-                pin_memory=True,
-            )
-            value_blocks = torch.empty(
-                size=(self.num_cpu_blocks, *value_block_shape),
-                dtype=self.dtype,
-                pin_memory=True,
-            )
-            cpu_cache.append((key_blocks, value_blocks))
-        return cpu_cache
-
-    def _swap(
-        self,
-        src: List[KVCache],
-        dst: List[KVCache],
-        src_to_dst: Dict[int, int],
-    ) -> None:
-        with torch.cuda.stream(self.cache_stream):
-            for i in range(self.num_layers):
-                src_key_cache, src_value_cache = src[i]
-                dst_key_cache, dst_value_cache = dst[i]
-                # Copy the key blocks.
-                cache_ops.swap_blocks(
-                    src_key_cache, dst_key_cache, src_to_dst)
-                # Copy the value blocks.
-                cache_ops.swap_blocks(
-                    src_value_cache, dst_value_cache, src_to_dst)
-                event = self.events[i]
-                event.record(stream=self.cache_stream)
-
-    def swap_in(self, src_to_dst: Dict[int, int]) -> None:
-        self._swap(self.cpu_cache, self.gpu_cache, src_to_dst)
-
-    def swap_out(self, src_to_dst: Dict[int, int]) -> None:
-        self._swap(self.gpu_cache, self.cpu_cache, src_to_dst)
-
-    def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
-        key_caches = [key_cache for key_cache, _ in self.gpu_cache]
-        value_caches = [value_cache for _, value_cache in self.gpu_cache]
-        # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU.
-        cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
--- a/cacheflow/worker/controller.py
+++ b/cacheflow/worker/controller.py
@@ -1,101 +0,0 @@
-from typing import Dict, List, Union, Tuple
-
-import ray
-
-from cacheflow.master.scheduler import Scheduler
-from cacheflow.sequence import SequenceGroupInputs
-from cacheflow.worker.worker import Worker
-
-
-DeviceID = Tuple[int, str, int] # rank, node resource (node IP), device id
-
-
-class Controller:
-
-    def __init__(
-        self,
-        stage_id: int,
-        stage_devices: List[DeviceID],
-        world_size: int,
-        tensor_parallel_size: int,
-        pipeline_parallel_size: int,
-        distributed_init_method: str,
-        model_name: str,
-        block_size: int,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-        dtype: str,
-        seed: int,
-        model_path: str,
-        use_dummy_weights: bool,
-        max_num_batched_tokens: int,
-    ) -> None:
-        self.stage_id = stage_id
-        self.stage_devices = stage_devices
-        self.model_name = model_name
-        self.block_size = block_size
-        self.num_gpu_blocks = num_gpu_blocks
-        self.num_cpu_blocks = num_cpu_blocks
-
-        # Which pipeline stage is this node assigned to?
-        self.is_first_stage = stage_id == 0
-        self.is_last_stage = False
-
-        self.workers: List[Worker] = []
-        for rank, node_resource, device_id in stage_devices:
-            worker_cls = ray.remote(num_cpus=0,
-                                    num_gpus=1,
-                                    resources={node_resource: 1e-5})(Worker)
-            worker = worker_cls.remote(
-                model_name=model_name,
-                block_size=block_size,
-                num_gpu_blocks=num_gpu_blocks,
-                num_cpu_blocks=num_cpu_blocks,
-                dtype=dtype,
-                seed=seed,
-                distributed_init_method=distributed_init_method,
-                rank=rank,
-                world_size=world_size,
-                tensor_parallel_size=tensor_parallel_size,
-                pipeline_parallel_size=pipeline_parallel_size,
-                model_path=model_path,
-                use_dummy_weights=use_dummy_weights,
-                max_num_batched_tokens=max_num_batched_tokens,
-            )
-            self.workers.append(worker)
-
-    def set_next(
-        self,
-        next_node: Union['Controller', 'Scheduler'],
-    ) -> None:
-        self.next_node = next_node
-        self.is_last_stage = isinstance(next_node, Scheduler)
-
-    def execute_stage(
-        self,
-        input_seq_groups: List[SequenceGroupInputs],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-    ) -> None:
-        futures = []
-        for worker in self.workers:
-            future = worker.execute_stage.remote(
-                input_seq_groups,
-                blocks_to_swap_in,
-                blocks_to_swap_out,
-                blocks_to_copy,
-            )
-            futures.append(future)
-
-        all_outputs = ray.get(futures)
-        # Make sure all workers have the same results.
-        output = all_outputs[0]
-        for other_output in all_outputs[1:]:
-            assert output == other_output
-
-        if self.is_last_stage:
-            self.next_node.post_step(output)
-        else:
-            # TODO: Support pipeline parallelism.
-            assert False
--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference.`