Compare commits


144 Commits

Author SHA1 Message Date
Woosuk Kwon
0f90effc66 Bump up to v0.2.3 (#1903)
2023-12-03 12:27:47 -08:00
Woosuk Kwon
464dd985e3 Fix num_gpus when TP > 1 (#1852) 2023-12-03 12:24:30 -08:00
Massimiliano Pronesti
c07a442854 chore(examples-docs): upgrade to OpenAI V1 (#1785) 2023-12-03 01:11:22 -08:00
Woosuk Kwon
cd3aa153a4 Fix broken worker test (#1900) 2023-12-02 22:17:33 -08:00
Woosuk Kwon
9b294976a2 Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00
Simon Mo
5313c2cb8b Add Production Metrics in Prometheus format (#1890) 2023-12-02 16:37:44 -08:00
Woosuk Kwon
5f09cbdb63 Fix broken sampler tests (#1896)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2023-12-02 16:06:17 -08:00
Simon Mo
4cefa9b49b [Docs] Update the AWQ documentation to highlight performance issue (#1883) 2023-12-02 15:52:47 -08:00
Jerry
f86bd6190a Fix the typo in SamplingParams' docstring (#1886) 2023-12-01 02:06:36 -08:00
Woosuk Kwon
e5452ddfd6 Normalize head weights for Baichuan 2 (#1876) 2023-11-30 20:03:58 -08:00
Woosuk Kwon
d06980dfa7 Fix Baichuan tokenizer error (#1874) 2023-11-30 18:35:50 -08:00
Adam Brusselback
66785cc05c Support chat template and echo for chat API (#1756) 2023-11-30 16:43:13 -08:00
Massimiliano Pronesti
05a38612b0 docs: add instruction for langchain (#1162) 2023-11-30 10:57:44 -08:00
Roy
d27f4bae39 Fix rope cache key error (#1867) 2023-11-30 08:29:28 -08:00
aisensiy
8d8c2f6ffe Support max-model-len argument for throughput benchmark (#1858) 2023-11-30 08:10:24 -08:00
Woosuk Kwon
51d3cb951d Remove max_num_seqs in latency benchmark script (#1855) 2023-11-30 00:00:32 -08:00
Woosuk Kwon
e74b1736a1 Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00
Allen
f07c1ceaa5 [FIX] Fix docker build error (#1831) (#1832)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2023-11-29 23:06:50 -08:00
Jee Li
63b2206ad0 Avoid multiple instantiations of the RoPE class (#1828) 2023-11-29 23:06:27 -08:00
Woosuk Kwon
27feead2f8 Refactor Worker & InputMetadata (#1843) 2023-11-29 22:16:37 -08:00
Michael McCulloch
c782195662 Disable Logs Requests should Disable Logging of requests. (#1779)
Co-authored-by: Michael McCulloch <mjm.gitlab@fastmail.com>
2023-11-29 21:50:02 -08:00
Simon Mo
0f621c2c7d [Docs] Add information about using shared memory in docker (#1845) 2023-11-29 18:33:56 -08:00
Woosuk Kwon
a9e4574261 Refactor Attention (#1840) 2023-11-29 15:37:31 -08:00
FlorianJoncour
0229c386c5 Better integration with Ray Serve (#1821)
Co-authored-by: FlorianJoncour <florian@zetta-sys.com>
2023-11-29 13:25:43 -08:00
Woosuk Kwon
a7b3e33078 [Fix] Fix RoPE in ChatGLM-32K (#1841) 2023-11-29 13:01:19 -08:00
Zhuohan Li
e19a64c7ef [FIX] Fix formatting error in main branch (#1822) 2023-11-28 16:56:43 -08:00
Zhuohan Li
1cb4ad8de9 [FIX] Fix formatting error 2023-11-29 00:40:19 +00:00
explainerauthors
6ed068a71a Use the type BlockTable (#1791) 2023-11-28 16:34:05 -08:00
Zhuohan Li
708e6c18b0 [FIX] Fix class naming (#1803) 2023-11-28 14:08:01 -08:00
Woosuk Kwon
b943890484 Fix OPT param names (#1819) 2023-11-28 11:22:44 -08:00
explainerauthors
a1125ad4df Correct comments in parallel_state.py (#1818) 2023-11-28 10:19:35 -08:00
ljss
a8b150c595 Init model on GPU to reduce CPU memory footprint (#1796) 2023-11-27 11:18:26 -08:00
Yunmo Chen
665cbcec4b Added echo function to OpenAI API server. (#1504) 2023-11-26 21:29:17 -08:00
Woosuk Kwon
7c600440f7 Fix model docstrings (#1764) 2023-11-23 23:04:44 -08:00
Yanming W
e0c6f556e8 [Build] Avoid building too many extensions (#1624) 2023-11-23 16:31:19 -08:00
ljss
de23687d16 Fix repetition penalty aligned with huggingface (#1577) 2023-11-22 14:41:44 -08:00
ljss
4cea74c73b Set top_p=0 and top_k=-1 in greedy sampling (#1748) 2023-11-22 12:51:09 -08:00
Casper
a921d8be9d [DOCS] Add engine args documentation (#1741) 2023-11-22 12:31:27 -08:00
陈序
094f716bf2 Add stop_token_ids in SamplingParams.__repr__ (#1745) 2023-11-21 20:13:53 -08:00
Zhuohan Li
7d761fe3c1 [FIX] Fix the case when input_is_parallel=False for ScaledActivation (#1737) 2023-11-20 23:56:48 -08:00
Woosuk Kwon
cf35d8f3d7 [BugFix] Fix TP support for AWQ (#1731) 2023-11-20 21:42:45 -08:00
boydfd
4bb6b67188 fix RAM OOM when load large models in tensor parallel mode. (#1395)
Co-authored-by: ran_lin <rlin@thoughtworks.com>
2023-11-20 19:02:42 -08:00
ljss
819b18e7ba Rewrite torch.repeat_interleave to remove cpu synchronization (#1599) 2023-11-20 17:46:32 -08:00
Zhuofan
19849db573 [Fix] Fix bugs in scheduler (#1727) 2023-11-20 16:10:50 -08:00
陈序
3d4ceb292c Fix hanging in the scheduler caused by long prompts (#1534) 2023-11-20 16:06:49 -08:00
Woosuk Kwon
f5a37c6c6c [BugFix] Fix a bug in loading safetensors (#1732) 2023-11-20 15:51:18 -08:00
Zhuohan Li
32c927b53f [FIX] Update the doc link in README.md (#1730) 2023-11-20 12:46:24 -08:00
Simon Mo
5ffc0d13a2 Migrate linter from pylint to ruff (#1665) 2023-11-20 11:58:01 -08:00
Wen Sun
112627e8b2 [Docs] Fix the code block's format in deploying_with_docker page (#1722) 2023-11-20 01:22:39 -08:00
Simon Mo
37c1e3c218 Documentation about official docker image (#1709) 2023-11-19 20:56:26 -08:00
Woosuk Kwon
06e9ebebd5 Add instructions to install vLLM+cu118 (#1717) 2023-11-18 23:48:58 -08:00
Woosuk Kwon
c5f7740d89 Bump up to v0.2.2 (#1689)
2023-11-18 21:57:07 -08:00
Woosuk Kwon
be66d9b125 Fix warning msg on quantization (#1715) 2023-11-18 21:49:55 -08:00
ljss
e1054247ba [Optimization] Implement fused add rmsnorm (#1667) 2023-11-18 18:18:02 -08:00
Woosuk Kwon
8d17774f92 Add AWQ support for all models (#1714) 2023-11-18 17:56:47 -08:00
twaka
e946260cf3 use get_tensor in safe_open (#1696) 2023-11-18 16:45:18 -08:00
liuyhwangyh
edb305584b Support download models from www.modelscope.cn (#1588) 2023-11-17 20:38:31 -08:00
Woosuk Kwon
bb00f66e19 Use quantization_config in hf config (#1695) 2023-11-17 16:23:49 -08:00
Roy
e87557b069 Support Min P Sampler (#1642) 2023-11-17 16:20:49 -08:00
Zhuofan
dcc543a298 [Minor] Fix comment (#1704) 2023-11-17 09:42:49 -08:00
Zhuohan Li
0fc280b06c Update the adding-model doc according to the new refactor (#1692) 2023-11-16 18:46:26 -08:00
Zhuohan Li
20d0699d49 [Fix] Fix comm test (#1691) 2023-11-16 16:28:39 -08:00
Iskren Ivov Chernev
686f5e3210 Return usage for openai streaming requests (#1663) 2023-11-16 15:28:36 -08:00
Zhuohan Li
415d109527 [Fix] Update Supported Models List (#1690) 2023-11-16 14:47:26 -08:00
maximzubkov
521b35f799 Support Microsoft Phi 1.5 (#1664) 2023-11-16 14:28:39 -08:00
Simon Mo
cb08cd0d75 [Minor] Fix duplication of ignored seq group in engine step (#1666) 2023-11-16 13:11:41 -08:00
twaka
2a2c135b41 Fix loading error when safetensors contains empty tensor (#1687) 2023-11-16 10:38:10 -08:00
Aaron Pham
65ea2ddf17 feat(config): support parsing torch.dtype (#1641)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2023-11-16 01:31:06 -08:00
Megha Agarwal
b514d3c496 Revert MptConfig to MPTConfig (#1668) 2023-11-16 01:19:39 -08:00
Zhuohan Li
7076fa1c9f TP/quantization/weight loading refactor part 2 - Refactor quantized linear logic and extend quantization support to all models (#1622)
Refactor the tensor parallelism, quantization, and weight-loading codes.

Summary of the new features enabled by this PR:
- **All models** are able to be quantized with AWQ and SqueezeLLM, and [soon GPTQ](https://github.com/vllm-project/vllm/pull/1580).
- Model loading code became much simpler.
- Support model parallelism for all MQA/GQA models when the number of key/value heads is smaller than the tensor parallel size.
2023-11-15 22:50:41 -08:00
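For context, a minimal usage sketch (not part of this PR) of how the refactored quantization path is driven through vLLM's public LLM API; the checkpoint name and tensor_parallel_size below are placeholders, and "squeezellm" can be passed instead of "awq":

    from vllm import LLM, SamplingParams

    # Placeholder AWQ-quantized checkpoint from the Hugging Face Hub.
    llm = LLM(
        model="TheBloke/Llama-2-7B-AWQ",
        quantization="awq",        # or "squeezellm"
        tensor_parallel_size=2,    # MQA/GQA models can now shard even when TP > num KV heads
    )
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0.8, max_tokens=32))
    print(outputs[0].outputs[0].text)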
Woosuk Kwon
660a7fcfa4 Add DeepSpeed MII backend to benchmark script (#1649) 2023-11-14 12:35:30 -08:00
Woosuk Kwon
054072bee5 [Minor] Move RoPE selection logic to get_rope (#1633) 2023-11-12 16:04:50 -08:00
lirui
eb825c1e74 Fix #1474 - AssertionError:assert param_slice.shape == loaded_weight.shape (#1631) 2023-11-12 15:53:12 -08:00
Dominik Schwabe
1b290ace4f Run default _AsyncLLMEngine._run_workers_async in threadpool (#1628) 2023-11-11 14:50:44 -08:00
Sin
0d578228ca config parser: add ChatGLM2 seq_length to _get_and_verify_max_len (#1617) 2023-11-09 19:29:51 -08:00
GhaziSyed
aebfcb262a Dockerfile: Upgrade Cuda to 12.1 (#1609) 2023-11-09 11:49:02 -08:00
forpanyang
ab9e8488d5 Add Yi model to quantization support (#1600) 2023-11-09 11:47:14 -08:00
Woosuk Kwon
fd58b73a40 Build CUDA11.8 wheels for release (#1596) 2023-11-09 03:52:29 -08:00
Yanming W
8efe23f150 Fix input_metadata.selected_token_indices in worker prepare_inputs (#1546) 2023-11-08 14:19:12 -08:00
Zhuohan Li
06458a0b42 Upgrade to CUDA 12 (#1527)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2023-11-08 14:17:49 -08:00
GoHomeToMacDonal
1a2bbc9301 ChatGLM Support (#1261) 2023-11-06 16:09:33 -08:00
Roy
e7f579eb97 Support Yi model (#1567) 2023-11-06 15:26:03 -08:00
Casper
8516999495 Add Quantization and AutoAWQ to docs (#1235) 2023-11-04 22:43:39 -07:00
Antoni Baum
9f669a9a7c Support YaRN models (#1264)
Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
Co-authored-by: Viktor Ferenczi <viktor@ferenczi.eu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2023-11-03 14:12:48 -07:00
Noam Gat
555bdcc5a3 Added logits processor API to sampling params (#1469) 2023-11-03 14:12:15 -07:00
lots-o
54ca1ba71d docs: add description (#1553) 2023-11-03 09:14:52 -07:00
Antoni Baum
9738b84a08 Force paged attention v2 for long contexts (#1510) 2023-11-01 16:24:32 -07:00
Woosuk Kwon
1fe0990023 Remove MPTConfig (#1529) 2023-11-01 15:29:05 -07:00
Fluder-Paradyne
7e90a2d117 Add /health Endpoint for both Servers (#1540) 2023-11-01 10:29:44 -07:00
ljss
5687d584fe [BugFix] Set engine_use_ray=True when TP>1 (#1531) 2023-11-01 02:14:18 -07:00
Wenfei Yan
cf8849f2d6 Add MptForCausalLM key in model_loader (#1526) 2023-10-31 15:46:53 -07:00
Cade Daniel
e575df33b1 [Small] Formatter only checks lints in changed files (#1528) 2023-10-31 15:39:38 -07:00
Woosuk Kwon
0ce8647dc5 Fix integer overflows in attention & cache ops (#1514) 2023-10-31 15:19:30 -07:00
Stephen Krider
9cabcb7645 Add Dockerfile (#1350) 2023-10-31 12:36:47 -07:00
Zhuohan Li
7b895c5976 [Fix] Fix duplicated logging messages (#1524) 2023-10-31 09:04:47 -07:00
Dan Lord
7013a80170 Add support for spaces_between_special_tokens 2023-10-30 16:52:56 -07:00
Jared Roesch
79a30912b8 Add py.typed so consumers of vLLM can get type checking (#1509)
* Add py.typed so consumers of vLLM can get type checking

* Update py.typed

---------
Co-authored-by: aarnphm <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-10-30 14:50:47 -07:00
Adam Brusselback
2f3d36a8a1 Fix logging so we actually get info level entries in the log. (#1494) 2023-10-30 10:02:21 -07:00
iongpt
ac8d36f3e5 Refactor LLMEngine demo script for clarity and modularity (#1413)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-10-30 09:14:37 -07:00
Antoni Baum
15f5632365 Delay GPU->CPU sync in sampling (#1337) 2023-10-30 09:01:34 -07:00
Woosuk Kwon
aa9af07cac Fix bias in InternLM (#1501) 2023-10-29 16:24:18 -07:00
ljss
69be658bba Support repetition_penalty (#1424) 2023-10-29 10:02:41 -07:00
Ricardo Lu
beac8dd461 fix: don't skip first special token. (#1497) 2023-10-29 04:26:36 -07:00
Qing
28b47d1e49 Add rope_scaling to Aquila model (#1457) 2023-10-29 04:25:21 -07:00
chooper1
1f24755bf8 Support SqueezeLLM (#1326)
Co-authored-by: squeeze-ai-lab <squeezeailab.bair@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2023-10-21 23:14:59 -07:00
Thiago Salvatore
bf31d3606a Pin pydantic dependency versions (#1429) 2023-10-21 11:18:58 -07:00
Wang Ran (汪然)
d189170b6c remove useless statements (#1408) 2023-10-20 08:52:07 -07:00
Light Lin
f61dc8072f Fix type hints (#1427) 2023-10-20 08:50:47 -07:00
Woosuk Kwon
f8a1e39fae [BugFix] Define __eq__ in SequenceGroupOutputs (#1389) 2023-10-17 01:09:44 -07:00
Wang Ran (汪然)
a132435204 Fix typo (#1383) 2023-10-16 21:53:37 -07:00
Woosuk Kwon
9524867701 Add Mistral 7B to test_models (#1366) 2023-10-16 17:49:54 -07:00
Woosuk Kwon
c1376e0f82 Change scheduler & input tensor shape (#1381) 2023-10-16 17:48:42 -07:00
Zhuohan Li
651c614aa4 Bump up the version to v0.2.1 (#1355)
2023-10-16 12:58:57 -07:00
Woosuk Kwon
d3a5bd9fb7 Fix sampler test (#1379) 2023-10-16 12:57:26 -07:00
Woosuk Kwon
e8ef4c0820 Fix PyTorch index URL in workflow (#1378) 2023-10-16 12:37:56 -07:00
Woosuk Kwon
348897af31 Fix PyTorch version to 2.0.1 in workflow (#1377) 2023-10-16 11:27:17 -07:00
Zhuohan Li
9d9072a069 Implement prompt logprobs & Batched topk for computing logprobs (#1328)
Co-authored-by: Yunmo Chen <16273544+wanmok@users.noreply.github.com>
2023-10-16 10:56:50 -07:00
Woosuk Kwon
928de46888 Implement PagedAttention V2 (#1348) 2023-10-16 00:59:57 -07:00
Woosuk Kwon
29678cd213 Minor fix on AWQ kernel launch (#1356) 2023-10-15 21:53:56 -07:00
Woosuk Kwon
d0740dff1b Fix error message on TORCH_CUDA_ARCH_LIST (#1239)
Co-authored-by: Yunfeng Bai <yunfeng.bai@scale.com>
2023-10-14 14:47:43 -07:00
Lu Wang
de89472897 Fix the issue for AquilaChat2-* models (#1339) 2023-10-13 11:51:29 -07:00
Woosuk Kwon
e7c8555d06 Bump up transformers version & Remove MistralConfig (#1254) 2023-10-13 10:05:26 -07:00
Antoni Baum
ec3b5ce9cc Improve detokenization performance (#1338) 2023-10-13 09:59:07 -07:00
ldwang
6368e777a8 Add Aquila2 to README (#1331)
Signed-off-by: ldwang <ftgreat@gmail.com>
Co-authored-by: ldwang <ftgreat@gmail.com>
2023-10-12 12:11:16 -07:00
Woosuk Kwon
875afe38ab Add blacklist in model checkpoint (#1325) 2023-10-12 01:05:37 -07:00
amaleshvemula
ee8217e5be Add Mistral to quantization model list (#1278) 2023-10-11 00:26:24 -07:00
CHU Tianxiang
980dd4a2c4 Fix overflow in awq kernel (#1295)
Co-authored-by: 楚天翔 <tianxiang.ctx@alibaba-inc.com>
2023-10-11 00:19:53 -07:00
twaka
8285736840 workaround of AWQ for Turing GPUs (#1252) 2023-10-10 19:48:16 -07:00
yhlskt23
91fce82c6f change the timing of sorting logits (#1309) 2023-10-10 19:37:42 -07:00
Wang Ran (汪然)
ac5cf86aa6 Fix __repr__ of SequenceOutputs (#1311) 2023-10-10 09:58:28 -07:00
yanxiyue
6a6119554c lock torch version to 2.0.1 (#1290) 2023-10-10 09:21:57 -07:00
Zhuohan Li
b95ee898fe [Minor] Fix comment in mistral.py (#1303) 2023-10-09 19:44:37 -07:00
Zhuohan Li
9eed4d1f3e Update README.md (#1292) 2023-10-08 23:15:50 -07:00
Zhuohan Li
6b5296aa3a [FIX] Explain why the finished_reason of ignored sequences are length (#1289) 2023-10-08 15:22:38 -07:00
Antoni Baum
ee92b58b3a Move bfloat16 check to worker (#1259) 2023-10-07 22:10:44 -07:00
Yunfeng Bai
09ff7f106a API server support ipv4 / ipv6 dualstack (#1288)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-10-07 15:15:54 -07:00
Antoni Baum
acbed3ef40 Use monotonic time where appropriate (#1249) 2023-10-02 19:22:05 -07:00
Federico Cassano
66d18a7fb0 add support for tokenizer revision (#1163)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-10-02 19:19:46 -07:00
Zhuohan Li
ba0bfd40e2 TP/quantization/weight loading refactor part 1 - Simplify parallel linear logic (#1181) 2023-10-02 15:36:09 -07:00
Woosuk Kwon
84e4e37d14 [Minor] Fix type annotations (#1238) 2023-10-02 15:28:31 -07:00
Zhuohan Li
a60b353005 support sharding llama2-70b on more than 8 GPUs (#1209)
Co-authored-by: JiCheng <247153481@qq.com>
2023-10-02 15:26:33 -07:00
Liang
ebe4d1db3a Fix boundary check in paged attention kernel (#1241) 2023-10-01 11:35:06 -07:00
kg6-sleipnir
b5a10eb0ef Added dtype arg to benchmarks (#1228) 2023-09-30 21:04:03 -07:00
Usama Ahmed
0967102c6d fixing typo in tiiuae/falcon-rw-7b model name (#1226) 2023-09-29 13:40:25 -07:00
159 changed files with 9127 additions and 5942 deletions


@@ -43,13 +43,14 @@ jobs:
     name: Build Wheel
     runs-on: ${{ matrix.os }}
     needs: release
     strategy:
       fail-fast: false
       matrix:
         os: ['ubuntu-20.04']
         python-version: ['3.8', '3.9', '3.10', '3.11']
-        cuda-version: ['11.8'] # Github runner can't build anything older than 11.8
+        pytorch-version: ['2.1.0']
+        cuda-version: ['11.8', '12.1']
     steps:
       - name: Checkout
@@ -69,9 +70,9 @@ jobs:
         run: |
           bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-      - name: Install PyTorch-cu${{ matrix.cuda-version }}
+      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
        run: |
-          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
       - name: Build wheel
         shell: bash
@@ -81,7 +82,7 @@ jobs:
           asset_name=${wheel_name//"linux"/"manylinux1"}
           echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
           echo "asset_name=${asset_name}" >> $GITHUB_ENV
       - name: Upload Release Asset
         uses: actions/upload-release-asset@v1
         env:


@@ -1,4 +1,4 @@
-name: pylint
+name: ruff
 on:
   # Trigger the workflow on push or pull request,
@@ -11,7 +11,7 @@ on:
       - main
 jobs:
-  pylint:
+  ruff:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -25,7 +25,7 @@ jobs:
       - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install pylint==2.8.2
+          pip install ruff==0.1.5
-      - name: Analysing the code with pylint
+      - name: Analysing the code with ruff
        run: |
-          pylint vllm
+          ruff vllm tests


@@ -11,5 +11,8 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 $python_executable -m pip install wheel packaging
 $python_executable -m pip install -r requirements.txt
+# Limit the number of parallel jobs to avoid OOM
+export MAX_JOBS=1
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist


@@ -16,3 +16,8 @@ sudo apt clean
 # Test nvcc
 PATH=/usr/local/cuda-$1/bin:${PATH}
 nvcc --version
+# Log gcc, g++, c++ versions
+gcc --version
+g++ --version
+c++ --version


@@ -1,11 +1,12 @@
 #!/bin/bash
 python_executable=python$1
-cuda_version=$2
+pytorch_version=$2
+cuda_version=$3
 # Install torch
 $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
-$python_executable -m pip install torch -f https://download.pytorch.org/whl/cu${cuda_version//./}/torch_stable.html
+$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
 # Print version information
 $python_executable --version


@@ -28,4 +28,4 @@ jobs:
         pip install toml==0.10.2
       - name: Running yapf
        run: |
-          yapf --diff --recursive vllm --exclude 'vllm/model_executor/parallel_utils/**'
+          yapf --diff --recursive vllm tests

.pylintrc (deleted, 434 lines)

@@ -1,434 +0,0 @@
# This Pylint rcfile contains a best-effort configuration to uphold the
# best-practices and style described in the Google Python style guide:
# https://google.github.io/styleguide/pyguide.html
#
# Its canonical open-source location is:
# https://google.github.io/styleguide/pylintrc
[MASTER]
# Files or directories to be skipped. They should be base names, not paths.
ignore=docs,parallel_utils
# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths.
ignore-patterns=
# Pickle collected data for later comparisons.
persistent=no
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Use multiple processes to speed up Pylint.
jobs=4
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once).You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use"--disable=all --enable=classes
# --disable=W"
disable=abstract-method,
apply-builtin,
arguments-differ,
attribute-defined-outside-init,
backtick,
bad-option-value,
basestring-builtin,
buffer-builtin,
c-extension-no-member,
consider-using-enumerate,
cmp-builtin,
cmp-method,
coerce-builtin,
coerce-method,
delslice-method,
div-method,
duplicate-code,
eq-without-hash,
execfile-builtin,
file-builtin,
filter-builtin-not-iterating,
fixme,
getslice-method,
global-statement,
hex-method,
idiv-method,
implicit-str-concat-in-sequence,
import-error,
import-self,
import-star-module-level,
inconsistent-return-statements,
input-builtin,
intern-builtin,
invalid-str-codec,
locally-disabled,
logging-fstring-interpolation, # added by vLLM
logging-not-lazy, # added by vLLM
long-builtin,
long-suffix,
map-builtin-not-iterating,
misplaced-comparison-constant,
missing-class-docstring, # TODO (vLLM): enable
missing-function-docstring,
missing-module-docstring, # TODO (vLLM): enable
metaclass-assignment,
next-method-called,
next-method-defined,
no-absolute-import,
no-else-break,
no-else-continue,
no-else-raise,
no-else-return,
no-init, # added
no-member,
no-name-in-module,
no-self-use,
nonzero-method,
oct-method,
old-division,
old-ne-operator,
old-octal-literal,
old-raise-syntax,
parameter-unpacking,
print-statement,
raising-string,
range-builtin-not-iterating,
raw_input-builtin,
rdiv-method,
reduce-builtin,
relative-import,
reload-builtin,
round-builtin,
setslice-method,
signature-differs,
standarderror-builtin,
suppressed-message,
sys-max-int,
too-few-public-methods,
too-many-ancestors,
too-many-arguments,
too-many-boolean-expressions,
too-many-branches,
too-many-instance-attributes,
too-many-locals,
too-many-nested-blocks,
too-many-public-methods,
too-many-return-statements,
too-many-statements,
trailing-newlines,
unichr-builtin,
unicode-builtin,
unnecessary-pass,
unpacking-in-except,
unspecified-encoding,
useless-else-on-loop,
useless-object-inheritance,
useless-suppression,
using-cmp-argument,
wrong-import-order,
xrange-builtin,
zip-builtin-not-iterating,
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages
reports=no
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables errors warning, statement which
# respectively contain the number of errors / warnings messages and the total
# number of statements analyzed. This is used by the global evaluation report
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
[BASIC]
# Good variable names which should always be accepted, separated by a comma
good-names=main,_
# Bad variable names which should always be refused, separated by a comma
bad-names=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
# Regular expression matching correct function names
function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
# Regular expression matching correct variable names
variable-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct constant names
const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
# Regular expression matching correct attribute names
attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
# Regular expression matching correct argument names
argument-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct class attribute names
class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
# Regular expression matching correct inline iteration names
inlinevar-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct class names
class-rgx=^_?[A-Z][a-zA-Z0-9]*$
# Regular expression matching correct module names
module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
# Regular expression matching correct method names
method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=10
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis. It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=80
# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
# lines made too long by directives to pytype.
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=(?x)(
^\s*(\#\ )?<?https?://\S+>?$|
^\s*(from\s+\S+\s+)?import\s+.+$)
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=yes
# Maximum number of lines in a module
max-module-lines=99999
# String used as indentation unit. The internal Google style guide mandates 2
# spaces. Google's externaly-published style guide says 4, consistent with
# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
# projects (like TensorFlow).
indent-string=' '
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=TODO
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=yes
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging,absl.logging,tensorflow.io.logging
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it working
# install python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,
TERMIOS,
Bastion,
rexec,
sets
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant, absl
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls,
class_
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=StandardError,
Exception,
BaseException

Dockerfile (new file, 77 lines)

@@ -0,0 +1,77 @@
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
# image to build pytorch extensions
FROM dev AS build
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt
# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
# max jobs used by Ninja to build extensions
ENV MAX_JOBS=$max_jobs
RUN python3 setup.py build_ext --inplace
# image to run unit testing suite
FROM dev AS test
# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
ENTRYPOINT ["python3", "-m", "pytest", "tests"]
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
# libnccl required for ray
RUN apt-get update -y \
&& apt-get install -y python3-pip
WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate fschat
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


@@ -10,22 +10,14 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 <p align="center">
-| <a href="https://vllm.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
 </p>
 ---
-**The First vLLM Bay Area Meetup (Oct 5th 6pm-8pm PT)**
-We are excited to invite you to the first vLLM meetup!
-The vLLM team will share recent updates and roadmap.
-We will also have vLLM users and contributors coming up to the stage to share their experiences.
-Please register [here](https://lu.ma/first-vllm-meetup) and join us!
----
 *Latest News* 🔥
+- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
 - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
 - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
@@ -54,9 +46,10 @@ vLLM is flexible and easy to use with:
 vLLM seamlessly supports many Hugging Face models, including the following architectures:
-- Aquila (`BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
-- Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
+- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
+- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
 - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
 - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
 - GPT-2 (`gpt2`, `gpt2-xl`, etc.)
 - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
@@ -67,7 +60,9 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
 - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
 - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
+- Phi-1.5 (`microsoft/phi-1_5`, etc.)
 - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
+- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
 Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
@@ -82,36 +77,6 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
 - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
 - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
-## Performance
-vLLM outperforms Hugging Face Transformers (HF) by up to 24x and Text Generation Inference (TGI) by up to 3.5x, in terms of throughput.
-For details, check out our [blog post](https://vllm.ai).
-<p align="center">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a10g_n1_dark.png">
-    <img src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a10g_n1_light.png" width="45%">
-  </picture>
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a100_n1_dark.png">
-    <img src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a100_n1_light.png" width="45%">
-  </picture>
-  <br>
-  <em> Serving throughput when each request asks for 1 output completion. </em>
-</p>
-<p align="center">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a10g_n3_dark.png">
-    <img src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a10g_n3_light.png" width="45%">
-  </picture>
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a100_n3_dark.png">
-    <img src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a100_n3_light.png" width="45%">
-  </picture> <br>
-  <em> Serving throughput when each request asks for 3 output completions. </em>
-</p>
 ## Contributing
 We welcome and value any contributions and collaborations.
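As a quick illustration of the architectures newly listed above (a sketch, not taken from the README), they load through the same LLM entry point; at the time of this change, Phi-1.5 and ChatGLM checkpoints shipped custom modeling code, so trust_remote_code=True may be required:

    from vllm import LLM, SamplingParams

    # microsoft/phi-1_5 is one of the models added to the supported list in this diff.
    llm = LLM(model="microsoft/phi-1_5", trust_remote_code=True)
    out = llm.generate(["def fibonacci(n):"],
                       SamplingParams(temperature=0.0, max_tokens=64))
    print(out[0].outputs[0].text)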


@@ -12,7 +12,6 @@ from vllm import LLM, SamplingParams
 def main(args: argparse.Namespace):
     print(args)
-    # Process all the requests in a single batch if possible.
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
     llm = LLM(
@@ -20,9 +19,8 @@ def main(args: argparse.Namespace):
         tokenizer=args.tokenizer,
         quantization=args.quantization,
         tensor_parallel_size=args.tensor_parallel_size,
-        max_num_seqs=args.batch_size,
-        max_num_batched_tokens=args.batch_size * args.input_len,
         trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
     )
     sampling_params = SamplingParams(
@@ -38,22 +36,31 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
-        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
-                     sampling_params=sampling_params,
-                     use_tqdm=False)
-        end_time = time.time()
-        latency = end_time - start_time
-        if profile:
-            torch.cuda.cudart().cudaProfilerStop()
-        return latency
+            with torch.profiler.profile(activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+            ]) as p:
+                llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                             sampling_params=sampling_params,
+                             use_tqdm=False)
+            print(p.key_averages())
+        else:
+            start_time = time.perf_counter()
+            llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+            end_time = time.perf_counter()
+            latency = end_time - start_time
+            return latency
     print("Warming up...")
     run_to_completion(profile=False)
+    if args.profile:
+        print("Profiling...")
+        run_to_completion(profile=True)
+        return
     # Benchmark.
     latencies = []
     for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
@@ -69,7 +76,7 @@ if __name__ == '__main__':
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', None],
+                        choices=['awq', 'squeezellm', None],
                         default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
@@ -87,5 +94,18 @@ if __name__ == '__main__':
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
     args = parser.parse_args()
     main(args)
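The profiling branch added above follows the standard torch.profiler pattern. A self-contained sketch of that pattern, using a dummy matmul workload instead of llm.generate (an assumption for illustration only):

    import torch

    # Collect CPU activity, and CUDA activity when a GPU is present.
    activities = [torch.profiler.ProfilerActivity.CPU]
    device = "cpu"
    if torch.cuda.is_available():
        activities.append(torch.profiler.ProfilerActivity.CUDA)
        device = "cuda"

    x = torch.randn(1024, 1024, device=device)
    with torch.profiler.profile(activities=activities) as p:
        for _ in range(10):
            y = x @ x  # stand-in for the generation call being profiled
    # Aggregated per-op statistics, as printed via key_averages() above.
    print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))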


@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()
     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":
@@ -148,7 +148,7 @@ async def send_request(
         if "error" not in output:
             break
-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")


@@ -6,18 +6,20 @@ import time
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
import torch import torch
from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from tqdm import tqdm from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
def sample_requests( def sample_requests(
dataset_path: str, dataset_path: str,
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]: ) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset. # Load the dataset.
with open(dataset_path) as f: with open(dataset_path) as f:
dataset = json.load(f) dataset = json.load(f)
@@ -35,6 +37,8 @@ def sample_requests(
tokenized_dataset = [] tokenized_dataset = []
for i in range(len(dataset)): for i in range(len(dataset)):
output_len = len(completion_token_ids[i]) output_len = len(completion_token_ids[i])
if fixed_output_len is not None:
output_len = fixed_output_len
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
# Filter out too long sequences. # Filter out too long sequences.
@@ -64,7 +68,10 @@ def run_vllm(
n: int, n: int,
use_beam_search: bool, use_beam_search: bool,
trust_remote_code: bool, trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int] = None,
) -> float: ) -> float:
from vllm import LLM, SamplingParams
llm = LLM( llm = LLM(
model=model, model=model,
tokenizer=tokenizer, tokenizer=tokenizer,
@@ -72,6 +79,8 @@ def run_vllm(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
seed=seed, seed=seed,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
) )
# Add the requests to the engine. # Add the requests to the engine.
@@ -91,10 +100,10 @@ def run_vllm(
sampling_params=sampling_params, sampling_params=sampling_params,
) )
start = time.time() start = time.perf_counter()
# FIXME(woosuk): Do use internal method. # FIXME(woosuk): Do not use internal method.
llm._run_engine(use_tqdm=True) llm._run_engine(use_tqdm=True)
end = time.time() end = time.perf_counter()
return end - start return end - start
@@ -116,7 +125,7 @@ def run_hf(
llm = llm.cuda() llm = llm.cuda()
pbar = tqdm(total=len(requests)) pbar = tqdm(total=len(requests))
start = time.time() start = time.perf_counter()
batch: List[str] = [] batch: List[str] = []
max_prompt_len = 0 max_prompt_len = 0
max_output_len = 0 max_output_len = 0
@@ -154,7 +163,23 @@ def run_hf(
batch = [] batch = []
max_prompt_len = 0 max_prompt_len = 0
max_output_len = 0 max_output_len = 0
end = time.time() end = time.perf_counter()
return end - start
def run_mii(
requests: List[Tuple[str, int, int]],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import pipeline
llm = pipeline(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests]
start = time.perf_counter()
llm(prompts, max_new_tokens=output_len)
end = time.perf_counter()
return end - start return end - start
@@ -163,20 +188,31 @@ def main(args: argparse.Namespace):
random.seed(args.seed) random.seed(args.seed)
# Sample the requests. # Sample the requests.
tokenizer = get_tokenizer(args.tokenizer, tokenizer = AutoTokenizer.from_pretrained(
trust_remote_code=args.trust_remote_code) args.tokenizer, trust_remote_code=args.trust_remote_code)
requests = sample_requests(args.dataset, args.num_prompts, tokenizer) if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm": if args.backend == "vllm":
elapsed_time = run_vllm(requests, args.model, args.tokenizer, elapsed_time = run_vllm(requests, args.model, args.tokenizer,
args.quantization, args.tensor_parallel_size, args.quantization, args.tensor_parallel_size,
args.seed, args.n, args.use_beam_search, args.seed, args.n, args.use_beam_search,
args.trust_remote_code) args.trust_remote_code, args.dtype,
args.max_model_len)
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.use_beam_search, args.hf_max_batch_size, args.use_beam_search, args.hf_max_batch_size,
args.trust_remote_code) args.trust_remote_code)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
else: else:
raise ValueError(f"Unknown backend: {args.backend}") raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len total_num_tokens = sum(prompt_len + output_len
@@ -189,17 +225,26 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.") parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend", parser.add_argument("--backend",
type=str, type=str,
choices=["vllm", "hf"], choices=["vllm", "hf", "mii"],
default="vllm") default="vllm")
parser.add_argument("--dataset", parser.add_argument("--dataset",
type=str, type=str,
required=True, default=None,
help="Path to the dataset.") help="Path to the dataset.")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m") parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None) parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization', parser.add_argument('--quantization',
'-q', '-q',
choices=['awq', None], choices=['awq', 'squeezellm', None],
default=None) default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n", parser.add_argument("--n",
@@ -219,7 +264,29 @@ if __name__ == "__main__":
parser.add_argument('--trust-remote-code', parser.add_argument('--trust-remote-code',
action='store_true', action='store_true',
help='trust remote code from huggingface') help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
@@ -229,7 +296,18 @@ if __name__ == "__main__":
raise ValueError("HF max batch size is required for HF backend.") raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None: if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.") raise ValueError("Quantization is only for vLLM backend.")
if args.tokenizer is None: elif args.backend == "mii":
args.tokenizer = args.model if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.use_beam_search:
raise ValueError("Beam search is not supported for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
main(args)
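For context on the new --input-len/--output-len pair: when --dataset is omitted, the script has to fabricate requests of a fixed length instead of sampling them from a dataset. The exact construction is outside the hunks shown above, so the snippet below is only an illustrative sketch of such a fallback (the function name and the dummy-prompt trick are assumptions, not the diff's code):

def synth_requests(num_prompts: int, input_len: int, output_len: int):
    # Hypothetical fallback when no dataset is given: every request reuses one
    # dummy prompt of roughly input_len tokens and asks for output_len tokens.
    prompt = " ".join(["hello"] * input_len)
    return [(prompt, input_len, output_len) for _ in range(num_prompts)]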


@@ -0,0 +1,197 @@
import argparse
import random
import time
import torch
from vllm._C import ops
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@torch.inference_mode()
def main(
version: str,
num_seqs: int,
context_len: int,
num_query_heads: int,
num_kv_heads: int,
head_size: int,
use_alibi: bool,
block_size: int,
dtype: torch.dtype,
seed: int,
do_profile: bool,
) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
scale = float(1.0 / (head_size**0.5))
query = torch.empty(num_seqs,
num_query_heads,
head_size,
dtype=dtype,
device="cuda")
query.uniform_(-scale, scale)
assert num_query_heads % num_kv_heads == 0
num_queries_per_kv = num_query_heads // num_kv_heads
head_mapping = torch.repeat_interleave(
torch.arange(num_kv_heads, dtype=torch.int32, device="cuda"),
num_queries_per_kv)
alibi_slopes = None
if use_alibi:
alibi_slopes = torch.randn(num_query_heads,
dtype=torch.float,
device="cuda")
context_lens = [context_len for _ in range(num_seqs)]
max_context_len = max(context_lens)
context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")
# Create the block tables.
max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
block_tables = []
for _ in range(num_seqs):
block_table = [
random.randint(0, NUM_BLOCKS - 1)
for _ in range(max_num_blocks_per_seq)
]
block_tables.append(block_table)
block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")
# Create the KV cache.
x = 16 // torch.tensor([], dtype=dtype).element_size()
key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x)
key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device="cuda")
key_cache.uniform_(-scale, scale)
value_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size, block_size)
value_cache = torch.empty(size=value_cache_shape,
dtype=dtype,
device="cuda")
value_cache.uniform_(-scale, scale)
# Prepare for the paged attention kernel.
output = torch.empty_like(query)
if version == "v2":
num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
PARTITION_SIZE)
tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size),
dtype=output.dtype,
device=output.device,
)
exp_sums = torch.empty(
size=(num_seqs, num_query_heads, num_partitions),
dtype=torch.float32,
device=output.device,
)
max_logits = torch.empty_like(exp_sums)
def run_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
if version == "v1":
ops.paged_attention_v1(
output,
query,
key_cache,
value_cache,
head_mapping,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
)
elif version == "v2":
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
head_mapping,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
)
else:
raise ValueError(f"Invalid version: {version}")
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:
torch.cuda.cudart().cudaProfilerStop()
return (end_time - start_time) / num_iters
# Warmup.
print("Warming up...")
run_benchmark(num_iters=3, profile=False)
# Benchmark.
if do_profile:
latency = run_benchmark(num_iters=1, profile=True)
else:
latency = run_benchmark(num_iters=100, profile=False)
print(f"Kernel running time: {latency * 1000000:.3f} us")
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Benchmark the paged attention kernel.")
parser.add_argument("--version",
type=str,
choices=["v1", "v2"],
default="v2")
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--context-len", type=int, default=4096)
parser.add_argument("--num-query-heads", type=int, default=64)
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
choices=[64, 80, 96, 112, 128, 256],
default=128)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--use-alibi", action="store_true")
parser.add_argument("--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="half")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--profile", action="store_true")
args = parser.parse_args()
print(args)
if args.num_query_heads % args.num_kv_heads != 0:
raise ValueError("num_query_heads must be divisible by num_kv_heads")
dtype_to_torch_dtype = {
"half": torch.half,
"bfloat16": torch.bfloat16,
"float": torch.float,
}
main(
version=args.version,
num_seqs=args.batch_size,
context_len=args.context_len,
num_query_heads=args.num_query_heads,
num_kv_heads=args.num_kv_heads,
head_size=args.head_size,
block_size=args.block_size,
use_alibi=args.use_alibi,
dtype=dtype_to_torch_dtype[args.dtype],
seed=args.seed,
do_profile=args.profile,
)
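The x factor above fixes the innermost packing of the key cache so that the kernel can issue 16-byte loads. A quick worked example of the resulting shapes for the script's defaults (head_size 128, 8 KV heads, block_size 16, fp16), written as a standalone sketch:

import torch

dtype = torch.half
x = 16 // torch.tensor([], dtype=dtype).element_size()   # 16 bytes / 2 bytes = 8 packed elements
key_cache_shape = (1024, 8, 128 // x, 16, x)              # (NUM_BLOCKS, num_kv_heads, head_size/x, block_size, x)
value_cache_shape = (1024, 8, 128, 16)                    # (NUM_BLOCKS, num_kv_heads, head_size, block_size)
print(x, key_cache_shape, value_cache_shape)              # 8 (1024, 8, 16, 16, 8) (1024, 8, 128, 16)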


@@ -1,28 +0,0 @@
#include <torch/extension.h>
void silu_and_mul(
torch::Tensor& out,
torch::Tensor& input);
void gelu_new(
torch::Tensor& out,
torch::Tensor& input);
void gelu_fast(
torch::Tensor& out,
torch::Tensor& input);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"silu_and_mul",
&silu_and_mul,
"Activation function used in SwiGLU.");
m.def(
"gelu_new",
&gelu_new,
"GELU implementation used in GPT-2.");
m.def(
"gelu_fast",
&gelu_fast,
"Approximate GELU implementation.");
}


@@ -13,11 +13,11 @@ __device__ __forceinline__ T silu(const T& x) {
template<typename scalar_t>
__global__ void silu_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]);
const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]);
out[token_idx * d + idx] = silu(x) * y;
@@ -27,11 +27,11 @@ __global__ void silu_and_mul_kernel(
} // namespace vllm
void silu_and_mul(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
int64_t num_tokens = input.numel() / input.size(-1);
int d = input.size(-1) / 2;
dim3 grid(num_tokens);
dim3 block(std::min(d, 1024));
@@ -52,11 +52,11 @@ namespace vllm {
// Element-wise activation kernel template.
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., d]
const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = __ldg(&input[token_idx * d + idx]);
out[token_idx * d + idx] = ACT_FN(x);
}
@@ -66,8 +66,8 @@ __global__ void activation_kernel(
// Launch element-wise activation kernel.
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
int d = input.size(-1); \
int64_t num_tokens = input.numel() / d; \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
@@ -100,15 +100,15 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
} // namespace vllm
void gelu_new(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
}
void gelu_fast(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
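Since these kernels now accept arbitrary leading dimensions ([..., 2 * d] inputs flattened to numel / last_dim "tokens"), a tiny PyTorch reference of what silu_and_mul computes may help when reading the indexing above; this is a hedged sketch, not the extension's actual Python wrapper:

import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # The last dimension holds the [gate, up] halves of size d each;
    # the kernel writes silu(gate) * up into an output of shape [..., d].
    d = x.size(-1) // 2
    return F.silu(x[..., :d]) * x[..., d:]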


@@ -1,22 +0,0 @@
#include <torch/extension.h>
#include <c10/util/Optional.h>
void single_query_cached_kv_attention(
torch::Tensor& out,
torch::Tensor& query,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& head_mapping,
float scale,
torch::Tensor& block_tables,
torch::Tensor& context_lens,
int block_size,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"single_query_cached_kv_attention",
&single_query_cached_kv_attention,
"Compute the attention between an input query and the cached key/value tensors");
}


@@ -26,6 +26,7 @@
#define WARP_SIZE 32
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
namespace vllm {
@@ -65,14 +66,18 @@ inline __device__ float block_sum(float* red_smem, float sum) {
return __shfl_sync(uint32_t(-1), sum, 0);
}
// TODO(woosuk): Merge the last two dimensions of the grid.
// Grid: (num_heads, num_seqs, max_num_partitions).
template<
typename scalar_t,
int HEAD_SIZE,
int BLOCK_SIZE,
int NUM_THREADS,
int PARTITION_SIZE = 0> // Zero means no partitioning.
__device__ void paged_attention_kernel(
float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
@@ -85,10 +90,33 @@ __global__ void single_query_cached_kv_attention_kernel(
const int q_stride,
const int kv_block_stride,
const int kv_head_stride) {
const int seq_idx = blockIdx.y;
const int partition_idx = blockIdx.z;
const int max_num_partitions = gridDim.z;
constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
const int context_len = context_lens[seq_idx];
if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
// No work to do. Terminate the thread block.
return;
}
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
// [start_block_idx, end_block_idx) is the range of blocks to process.
const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
const int num_blocks = end_block_idx - start_block_idx;
// [start_token_idx, end_token_idx) is the range of tokens to process.
const int start_token_idx = start_block_idx * BLOCK_SIZE;
const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
const int num_tokens = end_token_idx - start_token_idx;
constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int thread_idx = threadIdx.x;
const int warp_idx = thread_idx / WARP_SIZE;
@@ -97,7 +125,6 @@ __global__ void single_query_cached_kv_attention_kernel(
const int head_idx = blockIdx.x;
const int num_heads = gridDim.x;
const int kv_head_idx = head_mapping[head_idx];
const int seq_idx = blockIdx.y;
const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
// A vector type to store a part of a key or a query.
@@ -142,16 +169,16 @@ __global__ void single_query_cached_kv_attention_kernel(
constexpr int x = 16 / sizeof(scalar_t);
float qk_max = -FLT_MAX;
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
const int context_len = context_lens[seq_idx];
const int num_blocks = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
// Iterate over the key blocks.
// Each warp fetches a block of keys for each iteration.
// Each thread group in a warp fetches a key from the block, and computes
// dot product with the query.
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
// NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
// because int32 can lead to overflow when this variable is multiplied by large numbers
// (e.g., kv_block_stride).
const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
// Load a key to registers.
// Each thread in a thread group has a different part of the key.
@@ -184,7 +211,7 @@ __global__ void single_query_cached_kv_attention_kernel(
// Store the partial reductions to shared memory.
// NOTE(woosuk): It is required to zero out the masked logits.
const bool mask = token_idx >= context_len;
logits[token_idx - start_token_idx] = mask ? 0.f : qk;
// Update the max value.
qk_max = mask ? qk_max : fmaxf(qk_max, qk);
}
@@ -215,7 +242,7 @@ __global__ void single_query_cached_kv_attention_kernel(
// Get the sum of the exp values.
float exp_sum = 0.f;
for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
float val = __expf(logits[i] - qk_max);
logits[i] = val;
exp_sum += val;
@@ -224,11 +251,23 @@ __global__ void single_query_cached_kv_attention_kernel(
// Compute softmax.
const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
logits[i] *= inv_sum;
}
__syncthreads();
// If partitioning is enabled, store the max logit and exp_sum.
if (USE_PARTITIONING && thread_idx == 0) {
float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+ head_idx * max_num_partitions
+ partition_idx;
*max_logits_ptr = qk_max;
float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+ head_idx * max_num_partitions
+ partition_idx;
*exp_sums_ptr = exp_sum;
}
// Each thread will fetch 16 bytes from the value cache at a time.
constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
@@ -237,7 +276,7 @@ __global__ void single_query_cached_kv_attention_kernel(
constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
// NOTE(woosuk): We use FP32 for the accumulator for better accuracy.
float accs[NUM_ROWS_PER_THREAD];
@@ -248,12 +287,15 @@ __global__ void single_query_cached_kv_attention_kernel(
scalar_t zero_value;
zero(zero_value);
for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
// NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
// because int32 can lead to overflow when this variable is multiplied by large numbers
// (e.g., kv_block_stride).
const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
L_vec logits_vec;
from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx - start_token_idx));
const scalar_t* v_ptr = v_cache + physical_block_number * kv_block_stride
+ kv_head_idx * kv_head_stride;
@@ -263,13 +305,13 @@ __global__ void single_query_cached_kv_attention_kernel(
if (row_idx < HEAD_SIZE) {
const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
V_vec v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
if (block_idx == num_context_blocks - 1) {
// NOTE(woosuk): When v_vec contains the tokens that are out of the context,
// we should explicitly zero out the values since they may contain NaNs.
// See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
#pragma unroll
for (int j = 0; j < V_VEC_SIZE; j++) {
v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
}
}
@@ -327,7 +369,9 @@ __global__ void single_query_cached_kv_attention_kernel(
// Write the final output.
if (warp_idx == 0) {
scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+ head_idx * max_num_partitions * HEAD_SIZE
+ partition_idx * HEAD_SIZE;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
@@ -338,13 +382,167 @@ __global__ void single_query_cached_kv_attention_kernel(
}
}
// Grid: (num_heads, num_seqs, 1).
template<
typename scalar_t,
int HEAD_SIZE,
int BLOCK_SIZE,
int NUM_THREADS>
__global__ void paged_attention_v1_kernel(
scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
const int* __restrict__ head_mapping, // [num_heads]
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride,
const int kv_block_stride,
const int kv_head_stride) {
paged_attention_kernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS>(
/* exp_sums */ nullptr, /* max_logits */ nullptr,
out, q, k_cache, v_cache, head_mapping, scale, block_tables, context_lens,
max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride);
}
// Grid: (num_heads, num_seqs, max_num_partitions).
template<
typename scalar_t,
int HEAD_SIZE,
int BLOCK_SIZE,
int NUM_THREADS,
int PARTITION_SIZE>
__global__ void paged_attention_v2_kernel(
float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
const int* __restrict__ head_mapping, // [num_heads]
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride,
const int kv_block_stride,
const int kv_head_stride) {
paged_attention_kernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, PARTITION_SIZE>(
exp_sums, max_logits, tmp_out, q, k_cache, v_cache, head_mapping, scale,
block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
q_stride, kv_block_stride, kv_head_stride);
}
// Grid: (num_heads, num_seqs).
template<
typename scalar_t,
int HEAD_SIZE,
int NUM_THREADS,
int PARTITION_SIZE>
__global__ void paged_attention_v2_reduce_kernel(
scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_partitions) {
const int num_heads = gridDim.x;
const int head_idx = blockIdx.x;
const int seq_idx = blockIdx.y;
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
if (num_partitions == 1) {
// No need to reduce. Only copy tmp_out to out.
scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+ head_idx * max_num_partitions * HEAD_SIZE;
for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
out_ptr[i] = tmp_out_ptr[i];
}
// Terminate the thread block.
return;
}
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warp_idx = threadIdx.x / WARP_SIZE;
const int lane = threadIdx.x % WARP_SIZE;
// Size: 2 * num_partitions.
extern __shared__ char shared_mem[];
// Workspace for reduction.
__shared__ float red_smem[2 * NUM_WARPS];
// Load max logits to shared memory.
float* shared_max_logits = reinterpret_cast<float*>(shared_mem);
const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+ head_idx * max_num_partitions;
float max_logit = -FLT_MAX;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
const float l = max_logits_ptr[i];
shared_max_logits[i] = l;
max_logit = fmaxf(max_logit, l);
}
__syncthreads();
// Get the global max logit.
// Reduce within the warp.
#pragma unroll
for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
max_logit = fmaxf(max_logit, __shfl_xor_sync(uint32_t(-1), max_logit, mask));
}
if (lane == 0) {
red_smem[warp_idx] = max_logit;
}
__syncthreads();
// Reduce across warps.
max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
#pragma unroll
for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
max_logit = fmaxf(max_logit, __shfl_xor_sync(uint32_t(-1), max_logit, mask));
}
// Broadcast the max value to all threads.
max_logit = __shfl_sync(uint32_t(-1), max_logit, 0);
// Load rescaled exp sums to shared memory.
float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+ head_idx * max_num_partitions;
float global_exp_sum = 0.0f;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
float l = shared_max_logits[i];
float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit);
global_exp_sum += rescaled_exp_sum;
shared_exp_sums[i] = rescaled_exp_sum;
}
__syncthreads();
global_exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], global_exp_sum);
const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
// Aggregate tmp_out to out.
const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+ head_idx * max_num_partitions * HEAD_SIZE;
scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
#pragma unroll
for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
float acc = 0.0f;
for (int j = 0; j < num_partitions; ++j) {
acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
}
from_float(out_ptr[i], acc);
}
}
} // namespace vllm
#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
cudaFuncSetAttribute( \
vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS>, \
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem_size); \
vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS> \
<<<grid, block, shared_mem_size, stream>>>( \
out_ptr, \
query_ptr, \
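For a single (sequence, head) pair, the v2 reduce kernel above performs a standard two-pass softmax merge: each partition contributes a partition-local softmax output plus its max logit and exp-sum, and the reduce rescales and combines them. A minimal PyTorch sketch of that math (assumed per-(seq, head) shapes: tmp_out [num_partitions, head_size], exp_sums and max_logits [num_partitions]):

import torch

def v2_reduce_ref(tmp_out, exp_sums, max_logits):
    global_max = max_logits.max()
    rescaled = exp_sums * torch.exp(max_logits - global_max)   # rescale each partition's denominator
    weights = rescaled / rescaled.sum()                        # relative weight of each partition
    return (tmp_out * weights[:, None]).sum(dim=0)             # merged [head_size] output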
@@ -365,7 +563,7 @@ template<
typename T,
int BLOCK_SIZE,
int NUM_THREADS = 128>
void paged_attention_v1_launcher(
torch::Tensor& out,
torch::Tensor& query,
torch::Tensor& key_cache,
@@ -401,45 +599,37 @@ void single_query_cached_kv_attention_launcher(
int* context_lens_ptr = context_lens.data_ptr<int>();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
int logits_size = padded_max_context_len * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
// Keep that in sync with the logic here!
int shared_mem_size = std::max(logits_size, outputs_size);
dim3 grid(num_heads, num_seqs, 1);
dim3 block(NUM_THREADS);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
switch (head_size) {
// NOTE(woosuk): To reduce the compilation time, we only compile for the
// head sizes that we use in the model. However, we can easily extend this
// to support any head size which is a multiple of 16.
// LAUNCH_ATTENTION_KERNEL(T, 32, BLOCK_SIZE, NUM_THREADS);
// break;
case 64:
LAUNCH_PAGED_ATTENTION_V1(64);
break;
case 80:
LAUNCH_PAGED_ATTENTION_V1(80);
break;
case 96:
LAUNCH_PAGED_ATTENTION_V1(96);
break;
case 112:
LAUNCH_PAGED_ATTENTION_V1(112);
break;
case 128:
LAUNCH_PAGED_ATTENTION_V1(128);
break;
// case 160:
// LAUNCH_ATTENTION_KERNEL(T, 160, BLOCK_SIZE, NUM_THREADS);
// break;
// case 192:
// LAUNCH_ATTENTION_KERNEL(T, 192, BLOCK_SIZE, NUM_THREADS);
// break;
case 256:
LAUNCH_PAGED_ATTENTION_V1(256);
break;
default:
TORCH_CHECK(false, "Unsupported head size: ", head_size);
@@ -447,8 +637,8 @@ void single_query_cached_kv_attention_launcher(
}
}
#define CALL_V1_LAUNCHER(T, BLOCK_SIZE) \
paged_attention_v1_launcher<T, BLOCK_SIZE>( \
out, \
query, \
key_cache, \
@@ -462,41 +652,23 @@ void single_query_cached_kv_attention_launcher(
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
#define CALL_V1_LAUNCHER_BLOCK_SIZE(T) \
switch (block_size) { \
/* case 1: */ \
/* CALL_KERNEL_LAUNCHER(T, 1); */ \
/* break; */ \
/* case 2: */ \
/* CALL_KERNEL_LAUNCHER(T, 2); */ \
/* break; */ \
/* case 4: */ \
/* CALL_KERNEL_LAUNCHER(T, 4); */ \
/* break; */ \
case 8: \
CALL_V1_LAUNCHER(T, 8); \
break; \
case 16: \
CALL_V1_LAUNCHER(T, 16); \
break; \
case 32: \
CALL_V1_LAUNCHER(T, 32); \
break; \
/* case 64: */ \
/* CALL_KERNEL_LAUNCHER(T, 64); */ \
/* break; */ \
/* case 128: */ \
/* CALL_KERNEL_LAUNCHER(T, 128); */ \
/* break; */ \
/* case 256: */ \
/* CALL_KERNEL_LAUNCHER(T, 256); */ \
/* break; */ \
default: \
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
break; \
}
void paged_attention_v1(
torch::Tensor& out, // [num_seqs, num_heads, head_size]
torch::Tensor& query, // [num_seqs, num_heads, head_size]
torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
@@ -509,11 +681,186 @@ void single_query_cached_kv_attention(
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes) {
if (query.dtype() == at::ScalarType::Float) {
CALL_V1_LAUNCHER_BLOCK_SIZE(float);
} else if (query.dtype() == at::ScalarType::Half) {
CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t);
} else if (query.dtype() == at::ScalarType::BFloat16) {
CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16);
} else {
TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
}
}
#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
vllm::paged_attention_v2_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, PARTITION_SIZE> \
<<<grid, block, shared_mem_size, stream>>>( \
exp_sums_ptr, \
max_logits_ptr, \
tmp_out_ptr, \
query_ptr, \
key_cache_ptr, \
value_cache_ptr, \
head_mapping_ptr, \
scale, \
block_tables_ptr, \
context_lens_ptr, \
max_num_blocks_per_seq, \
alibi_slopes_ptr, \
q_stride, \
kv_block_stride, \
kv_head_stride); \
vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE> \
<<<reduce_grid, block, reduce_shared_mem_size, stream>>>( \
out_ptr, \
exp_sums_ptr, \
max_logits_ptr, \
tmp_out_ptr, \
context_lens_ptr, \
max_num_partitions);
template<
typename T,
int BLOCK_SIZE,
int NUM_THREADS = 128,
int PARTITION_SIZE = 512>
void paged_attention_v2_launcher(
torch::Tensor& out,
torch::Tensor& exp_sums,
torch::Tensor& max_logits,
torch::Tensor& tmp_out,
torch::Tensor& query,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& head_mapping,
float scale,
torch::Tensor& block_tables,
torch::Tensor& context_lens,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
int max_num_blocks_per_seq = block_tables.size(1);
int q_stride = query.stride(0);
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);
int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr = alibi_slopes ?
reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
: nullptr;
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
int* head_mapping_ptr = reinterpret_cast<int*>(head_mapping.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* context_lens_ptr = context_lens.data_ptr<int>();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
int logits_size = PARTITION_SIZE * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
// For paged attention v2 kernel.
dim3 grid(num_heads, num_seqs, max_num_partitions);
int shared_mem_size = std::max(logits_size, outputs_size);
// For paged attention v2 reduce kernel.
dim3 reduce_grid(num_heads, num_seqs);
int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
dim3 block(NUM_THREADS);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
switch (head_size) {
// NOTE(woosuk): To reduce the compilation time, we only compile for the
// head sizes that we use in the model. However, we can easily extend this
// to support any head size which is a multiple of 16.
case 64:
LAUNCH_PAGED_ATTENTION_V2(64);
break;
case 80:
LAUNCH_PAGED_ATTENTION_V2(80);
break;
case 96:
LAUNCH_PAGED_ATTENTION_V2(96);
break;
case 112:
LAUNCH_PAGED_ATTENTION_V2(112);
break;
case 128:
LAUNCH_PAGED_ATTENTION_V2(128);
break;
case 256:
LAUNCH_PAGED_ATTENTION_V2(256);
break;
default:
TORCH_CHECK(false, "Unsupported head size: ", head_size);
break;
}
}
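Partition bookkeeping in this launcher, with concrete numbers (PARTITION_SIZE = 512 as in the template default; the context length below is an illustrative value):

# max_context_len = 1300 tokens -> ceil(1300 / 512) = 3 partitions, so the main
# v2 kernel launches a (num_heads, num_seqs, 3) grid and tmp_out stores
# [num_seqs, num_heads, 3, head_size] partial outputs for the reduce kernel.
PARTITION_SIZE = 512
max_context_len = 1300
max_num_partitions = (max_context_len + PARTITION_SIZE - 1) // PARTITION_SIZE
print(max_num_partitions)  # 3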
#define CALL_V2_LAUNCHER(T, BLOCK_SIZE) \
paged_attention_v2_launcher<T, BLOCK_SIZE>( \
out, \
exp_sums, \
max_logits, \
tmp_out, \
query, \
key_cache, \
value_cache, \
head_mapping, \
scale, \
block_tables, \
context_lens, \
max_context_len, \
alibi_slopes);
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
#define CALL_V2_LAUNCHER_BLOCK_SIZE(T) \
switch (block_size) { \
case 8: \
CALL_V2_LAUNCHER(T, 8); \
break; \
case 16: \
CALL_V2_LAUNCHER(T, 16); \
break; \
case 32: \
CALL_V2_LAUNCHER(T, 32); \
break; \
default: \
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
break; \
}
void paged_attention_v2(
torch::Tensor& out, // [num_seqs, num_heads, head_size]
torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
torch::Tensor& tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
torch::Tensor& query, // [num_seqs, num_heads, head_size]
torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
torch::Tensor& head_mapping, // [num_heads]
float scale,
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& context_lens, // [num_seqs]
int block_size,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes) {
if (query.dtype() == at::ScalarType::Float) {
CALL_V2_LAUNCHER_BLOCK_SIZE(float);
} else if (query.dtype() == at::ScalarType::Half) {
CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t);
} else if (query.dtype() == at::ScalarType::BFloat16) {
CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16);
} else {
TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
}
@@ -522,3 +869,4 @@ void single_query_cached_kv_attention(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP


@@ -420,6 +420,11 @@ inline __device__ void from_float(bf16_8_t& dst, Float8_ src) {
#endif
}
// From bfloat16 to float32.
inline __device__ float to_float(__nv_bfloat16 u) {
return __bfloat162float(u);
}
// Zero-out a variable.
inline __device__ void zero(__nv_bfloat16& dst) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800


@@ -26,22 +26,3 @@ void gather_cached_kv(
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& slot_mapping);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"swap_blocks",
&swap_blocks,
"Swap in (out) the cache blocks from src to dst");
m.def(
"copy_blocks",
&copy_blocks,
"Copy the cache blocks from src to dst");
m.def(
"reshape_and_cache",
&reshape_and_cache,
"Reshape the key and value tensors and cache them");
m.def(
"gather_cached_kv",
&gather_cached_kv,
"Gather key and value from the cache into contiguous QKV tensors");
}


@@ -55,26 +55,26 @@ template<typename scalar_t>
__global__ void copy_blocks_kernel(
int64_t* key_cache_ptrs,
int64_t* value_cache_ptrs,
const int64_t* __restrict__ block_mapping,
const int numel_per_block) {
const int layer_idx = blockIdx.x;
const int pair_idx = blockIdx.y;
scalar_t* key_cache = reinterpret_cast<scalar_t*>(key_cache_ptrs[layer_idx]);
scalar_t* value_cache = reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]);
int64_t src_block_number = block_mapping[2 * pair_idx];
int64_t dst_block_number = block_mapping[2 * pair_idx + 1];
const int64_t src_block_offset = src_block_number * numel_per_block;
const int64_t dst_block_offset = dst_block_number * numel_per_block;
for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) {
int64_t src_offset = src_block_offset + i;
int64_t dst_offset = dst_block_offset + i;
key_cache[dst_offset] = key_cache[src_offset];
}
for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) {
int64_t src_offset = src_block_offset + i;
int64_t dst_offset = dst_block_offset + i;
value_cache[dst_offset] = value_cache[src_offset];
}
}
@@ -102,15 +102,15 @@ void copy_blocks(
value_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
}
// Create block mapping array.
std::vector<int64_t> block_mapping_vec;
for (const auto& pair : block_mapping) {
int64_t src_block_number = pair.first;
for (int64_t dst_block_number : pair.second) {
block_mapping_vec.push_back(src_block_number);
block_mapping_vec.push_back(dst_block_number);
}
}
int64_t* block_mapping_array = block_mapping_vec.data();
int num_pairs = block_mapping_vec.size() / 2;
// Move the data structures to the GPU.
@@ -120,7 +120,7 @@ void copy_blocks(
torch::Tensor value_cache_ptrs_tensor = torch::from_blob(
value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
torch::Tensor block_mapping_tensor = torch::from_blob(
block_mapping_array, {2 * num_pairs}, torch::kInt64).to(cache_device);
// Launch the kernel.
const int numel_per_block = key_caches[0][0].numel();
@@ -132,7 +132,7 @@ void copy_blocks(
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
key_cache_ptrs_tensor.data_ptr<int64_t>(),
value_cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping_tensor.data_ptr<int64_t>(),
numel_per_block);
}));
}
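The switch to int64_t offsets here guards against overflow once caches get large; a back-of-the-envelope check (the layer geometry below is illustrative, not taken from the diff):

# With num_heads * head_size * block_size = 40 * 128 * 32 = 163,840 elements per block,
# a block number around 14,000 already pushes the flat offset past int32 range.
numel_per_block = 40 * 128 * 32
block_number = 14_000
offset = block_number * numel_per_block
print(offset, offset > 2**31 - 1)  # 2293760000 True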
@@ -141,43 +141,48 @@ namespace vllm {
template<typename scalar_t>
__global__ void reshape_and_cache_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int key_stride,
const int value_stride,
const int num_heads,
const int head_size,
const int block_size,
const int x) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
if (slot_idx < 0) {
// Padding token that should be ignored.
return;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
const int n = num_heads * head_size;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int64_t src_key_idx = token_idx * key_stride + i;
const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int x_idx = head_offset / x;
const int x_offset = head_offset % x;
const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
+ head_idx * (head_size / x) * block_size * x
+ x_idx * block_size * x
+ block_offset * x
+ x_offset;
const int64_t tgt_value_idx = block_idx * num_heads * head_size * block_size
+ head_idx * head_size * block_size
+ head_offset * block_size
+ block_offset;
key_cache[tgt_key_idx] = key[src_key_idx];
value_cache[tgt_value_idx] = value[src_value_idx];
}
}
@@ -211,7 +216,7 @@ void reshape_and_cache(
value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(),
value_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(),
key_stride,
value_stride,
num_heads,


@@ -1,13 +0,0 @@
#include <torch/extension.h>
int get_device_attribute(
int attribute,
int device_id);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"get_device_attribute",
&get_device_attribute,
"Gets the specified device attribute.");
}

csrc/cuda_utils.h Normal file (5 lines added)

@@ -0,0 +1,5 @@
#include <torch/extension.h>
int get_device_attribute(
int attribute,
int device_id);


@@ -1,14 +0,0 @@
#include <torch/extension.h>
void rms_norm(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& weight,
float epsilon);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"rms_norm",
&rms_norm,
"Apply Root Mean Square (RMS) Normalization to the input tensor.");
}


@@ -9,8 +9,8 @@ namespace vllm {
// TODO(woosuk): Further optimize this kernel.
template<typename scalar_t>
__global__ void rms_norm_kernel(
scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [hidden_size]
const float epsilon,
const int num_tokens,
@@ -34,15 +34,45 @@ __global__ void rms_norm_kernel(
}
}
// TODO: Further optimize this kernel.
template<typename scalar_t>
__global__ void fused_add_rms_norm_kernel(
scalar_t* __restrict__ input, // [..., hidden_size]
scalar_t* __restrict__ residual, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [hidden_size]
const float epsilon,
const int num_tokens,
const int hidden_size) {
__shared__ float s_variance;
float variance = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float) input[blockIdx.x * hidden_size + idx];
x += (float) residual[blockIdx.x * hidden_size + idx];
variance += x * x;
residual[blockIdx.x * hidden_size + idx] = (scalar_t) x;
}
variance = blockReduceSum<float>(variance);
if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
}
__syncthreads();
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float) residual[blockIdx.x * hidden_size + idx];
input[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx];
}
}
} // namespace vllm
void rms_norm(
torch::Tensor& out, // [..., hidden_size]
torch::Tensor& input, // [..., hidden_size]
torch::Tensor& weight, // [hidden_size]
float epsilon) {
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
@@ -60,3 +90,28 @@ void rms_norm(
hidden_size);
});
}
void fused_add_rms_norm(
torch::Tensor& input, // [..., hidden_size]
torch::Tensor& residual, // [..., hidden_size]
torch::Tensor& weight, // [hidden_size]
float epsilon) {
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(),
"fused_add_rms_norm_kernel",
[&] {
vllm::fused_add_rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
input.data_ptr<scalar_t>(),
residual.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
epsilon,
num_tokens,
hidden_size);
});
}
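A PyTorch reference for the new fused op may help when wiring it into a model: the CUDA kernel updates both tensors in place (residual first accumulates the input, then the normalized result overwrites the input). A hedged sketch of the same semantics, written out of place for clarity:

import torch

def fused_add_rms_norm_ref(x, residual, weight, eps):
    z = x.float() + residual.float()                       # residual := x + residual
    variance = z.pow(2).mean(dim=-1, keepdim=True)
    out = (z * torch.rsqrt(variance + eps)).to(x.dtype) * weight
    return out, z.to(x.dtype)                              # (normalized output, updated residual)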

csrc/ops.h Normal file (75 lines added)

@@ -0,0 +1,75 @@
#include <torch/extension.h>
void paged_attention_v1(
torch::Tensor& out,
torch::Tensor& query,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& head_mapping,
float scale,
torch::Tensor& block_tables,
torch::Tensor& context_lens,
int block_size,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes);
void paged_attention_v2(
torch::Tensor& out,
torch::Tensor& exp_sums,
torch::Tensor& max_logits,
torch::Tensor& tmp_out,
torch::Tensor& query,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& head_mapping,
float scale,
torch::Tensor& block_tables,
torch::Tensor& context_lens,
int block_size,
int max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes);
void rms_norm(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& weight,
float epsilon);
void fused_add_rms_norm(
torch::Tensor& input,
torch::Tensor& residual,
torch::Tensor& weight,
float epsilon);
void rotary_embedding(
torch::Tensor& positions,
torch::Tensor& query,
torch::Tensor& key,
int head_size,
torch::Tensor& cos_sin_cache,
bool is_neox);
void silu_and_mul(
torch::Tensor& out,
torch::Tensor& input);
void gelu_new(
torch::Tensor& out,
torch::Tensor& input);
void gelu_fast(
torch::Tensor& out,
torch::Tensor& input);
torch::Tensor awq_gemm(
torch::Tensor _in_feats,
torch::Tensor _kernel,
torch::Tensor _scaling_factors,
torch::Tensor _zeros,
int split_k_iters);
void squeezellm_gemm(
torch::Tensor vec,
torch::Tensor mat,
torch::Tensor mul,
torch::Tensor lookup_table);
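These declarations back the single consolidated extension that the Python side imports (the benchmark above already uses from vllm._C import ops). A minimal sketch of calling one of the listed ops; the tensor sizes are made up for illustration:

import torch
from vllm._C import ops

x = torch.randn(8, 2 * 11008, dtype=torch.half, device="cuda")   # [num_tokens, 2 * d]
out = torch.empty(8, 11008, dtype=torch.half, device="cuda")     # [num_tokens, d]
ops.silu_and_mul(out, x)                                          # writes silu(gate) * up into out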


@@ -1,16 +0,0 @@
#include <torch/extension.h>
void rotary_embedding(
torch::Tensor& positions,
torch::Tensor& query,
torch::Tensor& key,
int head_size,
torch::Tensor& cos_sin_cache,
bool is_neox);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"rotary_embedding",
&rotary_embedding,
"Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
}


@@ -37,9 +37,9 @@ inline __device__ void apply_rotary_embedding(
template<typename scalar_t, bool IS_NEOX>
__global__ void rotary_embedding_kernel(
-  const int64_t* __restrict__ positions,        // [num_tokens]
-  scalar_t* __restrict__ query,                 // [num_tokens, num_heads, head_size]
-  scalar_t* __restrict__ key,                   // [num_tokens, num_kv_heads, head_size]
  const int64_t* __restrict__ positions,        // [batch_size, seq_len] or [num_tokens]
  scalar_t* __restrict__ query,                 // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
  scalar_t* __restrict__ key,                   // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
  const scalar_t* __restrict__ cos_sin_cache,   // [max_position, 2, rot_dim // 2]
  const int rot_dim,
  const int query_stride,
@@ -78,18 +78,18 @@ __global__ void rotary_embedding_kernel(
} // namespace vllm

void rotary_embedding(
-  torch::Tensor& positions,       // [num_tokens]
-  torch::Tensor& query,           // [num_tokens, num_heads * head_size]
-  torch::Tensor& key,             // [num_tokens, num_kv_heads * head_size]
  torch::Tensor& positions,       // [batch_size, seq_len] or [num_tokens]
  torch::Tensor& query,           // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size]
  torch::Tensor& key,             // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size]
  int head_size,
  torch::Tensor& cos_sin_cache,   // [max_position, rot_dim]
  bool is_neox) {
-  int num_tokens = query.size(0);
  int64_t num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
-  int num_heads = query.size(1) / head_size;
-  int num_kv_heads = key.size(1) / head_size;
-  int query_stride = query.stride(0);
-  int key_stride = key.stride(0);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
  int query_stride = query.stride(-2);
  int key_stride = key.stride(-2);
  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * rot_dim / 2, 512));
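The host-side change above generalizes the op from flat [num_tokens, ...] inputs to arbitrary leading dimensions. A small PyTorch sketch of the equivalent shape arithmetic (reference only):

import torch

def rotary_launch_dims(query: torch.Tensor, key: torch.Tensor, head_size: int):
    # Flatten all leading dimensions into a single token dimension, so the
    # kernel launches one block per token for both 2-D and 3-D inputs.
    num_tokens = query.numel() // query.size(-1)
    num_heads = query.size(-1) // head_size
    num_kv_heads = key.size(-1) // head_size
    query_stride = query.stride(-2)   # stride between consecutive tokens
    key_stride = key.stride(-2)
    return num_tokens, num_heads, num_kv_heads, query_stride, key_stride

# A [batch=2, seq=5, hidden=4096] query and an already-flattened
# [num_tokens=10, hidden=4096] query yield the same launch parameters.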

csrc/pybind.cpp (new file)
@@ -0,0 +1,80 @@
#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// vLLM custom ops
pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
// Attention ops
ops.def(
"paged_attention_v1",
&paged_attention_v1,
"Compute the attention between an input query and the cached keys/values using PagedAttention.");
ops.def(
"paged_attention_v2",
&paged_attention_v2,
"PagedAttention V2.");
// Activation ops
ops.def(
"silu_and_mul",
&silu_and_mul,
"Activation function used in SwiGLU.");
ops.def(
"gelu_new",
&gelu_new,
"GELU implementation used in GPT-2.");
ops.def(
"gelu_fast",
&gelu_fast,
"Approximate GELU implementation.");
// Layernorm
ops.def(
"rms_norm",
&rms_norm,
"Apply Root Mean Square (RMS) Normalization to the input tensor.");
ops.def(
"fused_add_rms_norm",
&fused_add_rms_norm,
"In-place fused Add and RMS Normalization");
// Rotary embedding
ops.def(
"rotary_embedding",
&rotary_embedding,
"Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
// Quantization ops
ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
// Cache ops
pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
cache_ops.def(
"swap_blocks",
&swap_blocks,
"Swap in (out) the cache blocks from src to dst");
cache_ops.def(
"copy_blocks",
&copy_blocks,
"Copy the cache blocks from src to dst");
cache_ops.def(
"reshape_and_cache",
&reshape_and_cache,
"Reshape the key and value tensors and cache them");
cache_ops.def(
"gather_cached_kv",
&gather_cached_kv,
"Gather key and value from the cache into contiguous QKV tensors");
// Cuda utils
pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
cuda_utils.def(
"get_device_attribute",
&get_device_attribute,
"Gets the specified device attribute.");
}
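A minimal sketch of how the grouped bindings might be driven from Python once the extension is built. The module path `vllm._C` is only an illustrative assumption (the actual name is set in setup.py); the argument order follows the declarations in csrc/ops.h above.

import torch
from vllm import _C  # hypothetical import path for the built extension

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    out = torch.empty_like(x)
    _C.ops.rms_norm(out, x, weight, eps)  # (out, input, weight, epsilon)
    return out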

@@ -1,15 +0,0 @@
#include <torch/extension.h>
torch::Tensor awq_gemm(
torch::Tensor _in_feats,
torch::Tensor _kernel,
torch::Tensor _scaling_factors,
torch::Tensor _zeros,
int split_k_iters);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"awq_gemm",
&awq_gemm,
"Quantized GEMM for AWQ");
}

@@ -16,7 +16,7 @@ namespace awq {
__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source)
{
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
  assert(false);
#else
  uint4 result;

@@ -29,7 +29,7 @@ __pack_half2(const half x, const half y) {
__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, int split_k_iters, half* __restrict__ A, int* __restrict__ B, half* __restrict__ scaling_factors, int* __restrict__ zeros, int M, int IC, int OC, half* __restrict__ C)
{
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
  assert(false);
#else
  static constexpr uint32_t ZERO = 0x0;
@@ -90,7 +90,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i
    + (((int)threadIdx.x) % (128 / 8)) * 8;
  half* C_ptr = C
-    + blockIdx_z * M * OC // blockIdz.x -> split_k dim
    + static_cast<long long>(blockIdx_z) * M * OC // blockIdz.x -> split_k dim
    + (((int)blockIdx_y) % j_factors1) * 128
    + ((int)threadIdx.y) * 64
    + (((int)threadIdx.x) % 4) * 2;
@@ -191,6 +191,39 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i
      }
    }
    for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
{
__asm__ __volatile__(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
}
{
__asm__ __volatile__(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
}
{
__asm__ __volatile__(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
}
{
__asm__ __volatile__(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
}
#else
      {
        __asm__ __volatile__(
          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
@@ -206,6 +239,8 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i
          : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
          : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
      }
#endif
    }
  }
}
@@ -226,7 +261,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i
__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, int split_k_iters, half* __restrict__ A, int* __restrict__ B, half* __restrict__ scaling_factors, int* __restrict__ zeros, int M, int IC, int OC, half* __restrict__ C)
{
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
  assert(false);
#else
  static constexpr uint32_t ZERO = 0x0;
@@ -288,7 +323,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in
    + (((int)threadIdx.x) % (64 / 8)) * 8;
  half* C_ptr = C
-    + blockIdx_z * M * OC // blockIdz.x -> split_k dim
    + static_cast<long long>(blockIdx_z) * M * OC // blockIdz.x -> split_k dim
    + (((int)blockIdx_y) % j_factors1) * 64
    + ((int)threadIdx.y) * 32
    + (((int)threadIdx.x) % 4) * 2;
@@ -392,7 +427,39 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in
  for (int j_0_4 = 0; j_0_4 < 2; ++j_0_4)
  {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
{
__asm__ __volatile__(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
}
{
__asm__ __volatile__(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
}
{
__asm__ __volatile__(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
}
{
__asm__ __volatile__(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
: "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
}
#else
    {
      __asm__ __volatile__(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
@@ -408,6 +475,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in
: "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
: "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
} }
#endif
} }
} }
} }
@@ -466,6 +534,7 @@ torch::Tensor awq_gemm(
  if (num_out_channels % group_size != 0)
    throw std::invalid_argument("OC is not multiple of Group size");
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  if (num_out_channels % 128 == 0)
  {
    int j_factors1 = num_out_channels / 128 / 1;
@@ -473,18 +542,18 @@ torch::Tensor awq_gemm(
    // threadIdx.x: 32
    // threadIdx.y: i_factors[2] * j_factors[2]
    dim3 threads_per_block(32, 2);
-    vllm::awq::gemm_forward_4bit_cuda_m16n128k32<<<num_blocks, threads_per_block>>>(
    vllm::awq::gemm_forward_4bit_cuda_m16n128k32<<<num_blocks, threads_per_block, 0, stream>>>(
      group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, num_out_channels, out_feats);
  }
  else if (num_out_channels % 64 == 0)
  {
    int j_factors1 = num_out_channels / 64 / 1;
    dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters);
    // threadIdx.x: 32
    // threadIdx.y: i_factors[2] * j_factors[2]
    dim3 threads_per_block(32, 2);
-    vllm::awq::gemm_forward_4bit_cuda_m16n64k32<<<num_blocks, threads_per_block>>>(
    vllm::awq::gemm_forward_4bit_cuda_m16n64k32<<<num_blocks, threads_per_block, 0, stream>>>(
      group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, num_out_channels, out_feats);
  }
  return _out_feats.sum(0);

@@ -0,0 +1,148 @@
#include <torch/all.h>
#include <torch/python.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
// half-tensor
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDATensorMethods.cuh>
#define BLOCKWIDTH 128
#define BLOCKHEIGHT4 16
namespace vllm {
namespace squeezellm {
__device__ inline unsigned int as_unsigned(int i) {
return *reinterpret_cast<unsigned int*>(&i);
}
// 4-bit matvec kernel (LUT-based)
__global__ void NUQ4MatMulKernel(
const half2* __restrict__ vec,
const int* __restrict__ mat,
half2* __restrict__ mul,
const __half* __restrict__ lookup_table,
int height,
int width,
int batch,
int vec_height
) {
const int blockwidth2 = BLOCKWIDTH / 2;
int row = BLOCKHEIGHT4 * blockIdx.x;
int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;
__shared__ half2 blockvec[blockwidth2];
__shared__ __half deq2[16][BLOCKWIDTH];
int off = threadIdx.x;
int column_offset = col * 16;
for (int val = 0; val < 16; val += 1) {
int lut_index = column_offset + val;
deq2[val][off] = lookup_table[lut_index];
}
__half res;
half2 res2;
half2 tmp2;
int i;
int k;
unsigned int tmp1;
unsigned int lut_index1, lut_index2;
for (int b = 0; b < batch; ++b){
i = width * row + col;
res = __int2half_rd(0);
k = 0;
__syncthreads();
if (threadIdx.x < blockwidth2)
blockvec[threadIdx.x] = vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + threadIdx.x];
__syncthreads();
while (k < blockwidth2) {
tmp1 = as_unsigned(mat[i]);
res2 = {};
tmp2 = {};
lut_index1 = tmp1 & 0xF;
lut_index2 = (tmp1 >> 4) & 0xF;
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
res2 = __hfma2(tmp2, blockvec[k + 0], res2);
lut_index1 = (tmp1 >> 8) & 0xF;
lut_index2 = (tmp1 >> 12) & 0xF;
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
res2 = __hfma2(tmp2, blockvec[k + 1], res2);
lut_index1 = (tmp1 >> 16) & 0xF;
lut_index2 = (tmp1 >> 20) & 0xF;
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
res2 = __hfma2(tmp2, blockvec[k + 2], res2);
lut_index1 = (tmp1 >> 24) & 0xF;
lut_index2 = (tmp1 >> 28) & 0xF;
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
res2 = __hfma2(tmp2, blockvec[k + 3], res2);
res = __hadd(__hadd(res2.x, res2.y), res);
i += width;
k += 4;
}
// col%2 -> only set one of the two values
half2 res3 = {};
if (col % 2 == 0) {
res3.x = res;
} else {
res3.y = res;
}
atomicAdd(&mul[b * width / 2 + col / 2], res3);
}
}
} // namespace squeezellm
} // namespace vllm
// 4-bit matvec kernel (LUT-based)
void squeezellm_gemm(
torch::Tensor vec,
torch::Tensor mat,
torch::Tensor mul,
torch::Tensor lookup_table
) {
int height = mat.size(0);
int width = mat.size(1);
int batch = vec.size(0);
int vec_height = vec.size(1);
dim3 blocks(
(height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
(width + BLOCKWIDTH - 1) / BLOCKWIDTH
);
dim3 threads(BLOCKWIDTH);
vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads>>>(
(half2*) vec.data<at::Half>(),
mat.data_ptr<int>(),
(half2*) mul.data<at::Half>(),
(__half*) lookup_table.data<at::Half>(),
height, width, batch, vec_height
);
}
#undef BLOCKWIDTH
#undef BLOCKHEIGHT4
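For reference, a PyTorch sketch of the dequantization scheme this kernel appears to implement, as inferred from the code above: each int32 in `mat` packs eight 4-bit indices along the input dimension (lowest nibble first), and `lookup_table` holds a 16-entry codebook per output channel. The sketch accumulates in float32 for clarity, whereas the kernel accumulates in half precision.

import torch

def squeezellm_dequant_ref(mat: torch.Tensor, lookup_table: torch.Tensor) -> torch.Tensor:
    # mat: int32 [in_features // 8, out_features]; lookup_table: fp16 [out_features, 16].
    packed_rows, width = mat.shape
    u = mat.to(torch.int64) & 0xFFFFFFFF                 # reinterpret as unsigned
    shifts = torch.arange(0, 32, 4, device=mat.device)   # eight nibbles per int32
    idx = (u.unsqueeze(-1) >> shifts) & 0xF              # [packed_rows, width, 8]
    idx = idx.permute(0, 2, 1).reshape(packed_rows * 8, width)
    return torch.gather(lookup_table.t().contiguous(), 0, idx)

def squeezellm_gemm_ref(vec: torch.Tensor, mat: torch.Tensor,
                        lookup_table: torch.Tensor) -> torch.Tensor:
    # vec: fp16 [batch, in_features] -> reference output [batch, out_features]
    weight = squeezellm_dequant_ref(mat, lookup_table)
    return vec.float() @ weight.float()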

(Eight binary image files were removed or replaced; previews not shown.)

@@ -3,14 +3,14 @@
Installation
============

-vLLM is a Python library that also contains pre-compiled C++ and CUDA (11.8) binaries.
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.

Requirements
------------

* OS: Linux
* Python: 3.8 -- 3.11
-* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, etc.)
* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)

Install with pip
----------------

@@ -23,9 +23,24 @@ You can install vLLM using pip:

    $ conda create -n myenv python=3.8 -y
    $ conda activate myenv
-    $ # Install vLLM.
    $ # Install vLLM with CUDA 12.1.
    $ pip install vllm

.. note::
    As of now, vLLM's binaries are compiled on CUDA 12.1 by default.
    However, you can install vLLM with CUDA 11.8 by running:

    .. code-block:: console

        $ # Install vLLM with CUDA 11.8.
        $ # Replace `cp310` with your Python version (e.g., `cp38`, `cp39`, `cp311`).
        $ pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp310-cp310-manylinux1_x86_64.whl

        $ # Re-install PyTorch with CUDA 11.8.
        $ pip uninstall torch -y
        $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118

.. _build_from_source:

@@ -45,6 +60,5 @@ You can also build and install vLLM from source:

.. code-block:: console

-    $ # Pull the Docker image with CUDA 11.8.
    $ # Use `--ipc=host` to make sure the shared memory is large enough.
-    $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:22.12-py3
    $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3

@@ -40,6 +40,16 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O
    llm = LLM(model="facebook/opt-125m")

Use model from www.modelscope.cn

.. code-block:: shell

    export VLLM_USE_MODELSCOPE=True

.. code-block:: python

    llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)

Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.

.. code-block:: python
@@ -67,6 +77,16 @@ Start the server:
    $ python -m vllm.entrypoints.api_server

Use model from www.modelscope.cn

.. code-block:: console

    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server \
    $ --model="qwen/Qwen-7B-Chat" \
    $ --revision="v1.1.8" \
    $ --trust-remote-code

By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.

Query the model in shell:
@@ -87,6 +107,7 @@ OpenAI-Compatible Server
------------------------

vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_, `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_, and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.

Start the server:

@@ -95,7 +116,20 @@ Start the server:

    $ python -m vllm.entrypoints.openai.api_server \
    $ --model facebook/opt-125m

-By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
Use model from www.modelscope.cn

.. code-block:: console

    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
    $ --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code

By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:

.. code-block:: console

    $ python -m vllm.entrypoints.openai.api_server \
    $ --model facebook/opt-125m \
    $ --chat-template ./examples/template_chatml.json

This server can be queried in the same format as OpenAI API. For example, list the models:
@@ -103,6 +137,9 @@ This server can be queried in the same format as OpenAI API. For example, list t
    $ curl http://localhost:8000/v1/models

Using OpenAI Completions API with vLLM
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Query the model with input prompts:

.. code-block:: console

@@ -120,12 +157,65 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep

.. code-block:: python

-    import openai
    from openai import OpenAI

    # Modify OpenAI's API key and API base to use vLLM's API server.
-    openai.api_key = "EMPTY"
-    openai.api_base = "http://localhost:8000/v1"
-    completion = openai.Completion.create(model="facebook/opt-125m",
    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    completion = client.completions.create(model="facebook/opt-125m",
                                           prompt="San Francisco is a")
    print("Completion result:", completion)

For a more detailed client example, refer to `examples/openai_completion_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py>`_.
Using OpenAI Chat API with vLLM
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
Querying the model using OpenAI Chat API:
You can use the `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_ endpoint to communicate with the model in a chat-like interface:
.. code-block:: console
$ curl http://localhost:8000/v1/chat/completions \
$ -H "Content-Type: application/json" \
$ -d '{
$ "model": "facebook/opt-125m",
$ "messages": [
$ {"role": "system", "content": "You are a helpful assistant."},
$ {"role": "user", "content": "Who won the world series in 2020?"}
$ ]
$ }'
Python Client Example:
Using the `openai` python package, you can also communicate with the model in a chat-like manner:
.. code-block:: python
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
chat_response = client.chat.completions.create(
model="facebook/opt-125m",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
]
)
print("Chat response:", chat_response)
For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation.

@@ -65,6 +65,9 @@ Documentation
    serving/distributed_serving
    serving/run_on_sky
    serving/deploying_with_triton
    serving/deploying_with_docker
    serving/serving_with_langchain
    serving/metrics

.. toctree::
    :maxdepth: 1
@@ -72,3 +75,10 @@ Documentation
    models/supported_models
    models/adding_model
models/engine_args
.. toctree::
:maxdepth: 1
:caption: Quantization
quantization/auto_awq

@@ -18,7 +18,7 @@ This document provides a high-level guide on integrating a `HuggingFace Transfor
0. Fork the vLLM repository
--------------------------------

-Start by forking our `GitHub <https://github.com/vllm-project/vllm/>`_ repository and then :ref:`build it from source <build_from_source>`.
Start by forking our `GitHub`_ repository and then :ref:`build it from source <build_from_source>`.
This gives you the ability to modify the codebase and test your model.

@@ -62,31 +62,34 @@ Next, you need to rewrite the :code:`forward` methods of your model by following

+) -> SamplerOutput:

3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
-4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture.
4. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.

.. note::
    Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
    If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.

-3. (Optional) Implement tensor parallelism support
3. (Optional) Implement tensor parallelism and quantization support
-------------------------------------------------------------------

If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`.
-When it comes to the linear layers, you should use either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`.
-Typically, :code:`ColumnParallelLinear` is used for QKV linear layers and the first linear layers of the MLP blocks.
-For the remaining linear layers, :code:`RowParallelLinear` is used.
For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
When it comes to the linear layers, we provide the following options to parallelize them (see the usage sketch after this list):
* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
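A hedged usage sketch of how these layers might be wired together in a model's ``__init__`` (class and argument names here are illustrative and may not match the exact signatures in ``vllm.model_executor.layers.linear``):

.. code-block:: python

    import torch.nn as nn
    from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                   QKVParallelLinear,
                                                   RowParallelLinear)

    class AttentionAndMLP(nn.Module):
        def __init__(self, hidden_size, head_size, total_num_heads,
                     total_num_kv_heads, intermediate_size, linear_method=None):
            super().__init__()
            # Column-parallel fused QKV projection; row-parallel output projection.
            self.qkv_proj = QKVParallelLinear(hidden_size, head_size,
                                              total_num_heads, total_num_kv_heads,
                                              bias=False, linear_method=linear_method)
            self.o_proj = RowParallelLinear(total_num_heads * head_size, hidden_size,
                                            bias=False, linear_method=linear_method)
            # SiLU-gated MLP: gate_proj and up_proj merged into one column-parallel
            # layer; down_proj stays row-parallel.
            self.gate_up_proj = MergedColumnParallelLinear(
                hidden_size, [intermediate_size] * 2, bias=False,
                linear_method=linear_method)
            self.down_proj = RowParallelLinear(intermediate_size, hidden_size,
                                               bias=False, linear_method=linear_method)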
4. Implement the weight loading logic
-------------------------------------

You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class.
-This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model.
-While the process is straightforward for most layers, the tensor-parallel layers necessitate some additional care as their weights should be partitioned to multiple GPUs.
This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
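A hedged sketch of what such a ``load_weights`` method can look like for a Llama-style model; the helper names (``hf_model_weights_iterator``, ``default_weight_loader``) and the exact parameter names are assumptions for illustration:

.. code-block:: python

    # Hypothetical helper locations; the actual names/paths may differ.
    from vllm.model_executor.weight_utils import (default_weight_loader,
                                                  hf_model_weights_iterator)

    def load_weights(self, model_name_or_path, cache_dir=None,
                     load_format="auto", revision=None):
        # (vLLM param name, HF weight name, shard id) for merged parameters.
        stacked_params_mapping = [
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                param = params_dict[name.replace(weight_name, param_name)]
                # The parallel layer attaches a weight_loader that places this
                # shard inside the merged parameter.
                param.weight_loader(param, loaded_weight, shard_id)
                break
            else:
                param = params_dict[name]
                default_weight_loader(param, loaded_weight)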
5. Register your model
----------------------

@@ -0,0 +1,114 @@
.. _engine_args:
Engine Arguments
================
Below, you can find an explanation of every engine argument for vLLM:
.. option:: --model <model_name_or_path>
Name or path of the huggingface model to use.
.. option:: --tokenizer <tokenizer_name_or_path>
Name or path of the huggingface tokenizer to use.
.. option:: --revision <revision>
The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
.. option:: --tokenizer-revision <revision>
The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
.. option:: --tokenizer-mode {auto,slow}
The tokenizer mode.
* "auto" will use the fast tokenizer if available.
* "slow" will always use the slow tokenizer.
.. option:: --trust-remote-code
Trust remote code from huggingface.
.. option:: --download-dir <directory>
Directory to download and load the weights, default to the default cache dir of huggingface.
.. option:: --load-format {auto,pt,safetensors,npcache,dummy}
The format of the model weights to load.
* "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.
* "pt" will load the weights in the pytorch bin format.
* "safetensors" will load the weights in the safetensors format.
* "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
* "dummy" will initialize the weights with random values, mainly for profiling.
.. option:: --dtype {auto,half,float16,bfloat16,float,float32}
Data type for model weights and activations.
* "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
* "half" for FP16. Recommended for AWQ quantization.
* "float16" is the same as "half".
* "bfloat16" for a balance between precision and range.
* "float" is shorthand for FP32 precision.
* "float32" for FP32 precision.
.. option:: --max-model-len <length>
Model context length. If unspecified, will be automatically derived from the model config.
.. option:: --worker-use-ray
Use Ray for distributed serving, will be automatically set when using more than 1 GPU.
.. option:: --pipeline-parallel-size (-pp) <size>
Number of pipeline stages.
.. option:: --tensor-parallel-size (-tp) <size>
Number of tensor parallel replicas.
.. option:: --max-parallel-loading-workers <workers>
Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models.
.. option:: --block-size {8,16,32}
Token block size for contiguous chunks of tokens.
.. option:: --seed <seed>
Random seed for operations.
.. option:: --swap-space <size>
CPU swap space size (GiB) per GPU.
.. option:: --gpu-memory-utilization <percentage>
The percentage of GPU memory to be used for the model executor.
.. option:: --max-num-batched-tokens <tokens>
Maximum number of batched tokens per iteration.
.. option:: --max-num-seqs <sequences>
Maximum number of sequences per iteration.
.. option:: --max-paddings <paddings>
Maximum number of paddings in a batch.
.. option:: --disable-log-stats
Disable logging statistics.
.. option:: --quantization (-q) {awq,squeezellm,None}
Method used to quantize the weights.
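Most of these arguments can also be set programmatically through the ``LLM`` entrypoint (or ``EngineArgs``), with dashes replaced by underscores. An illustrative example (the model and values are arbitrary):

.. code-block:: python

    from vllm import LLM

    llm = LLM(
        model="facebook/opt-125m",
        dtype="half",
        max_model_len=2048,
        gpu_memory_utilization=0.90,
        swap_space=4,
        seed=0,
    )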

@@ -19,13 +19,16 @@ Alongside each architecture, we include some popular models that use it.
    - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
  * - :code:`BaiChuanForCausalLM`
    - Baichuan
-    - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc.
    - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc.
  * - :code:`ChatGLMModel`
    - ChatGLM
    - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
  * - :code:`BloomForCausalLM`
    - BLOOM, BLOOMZ, BLOOMChat
    - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
  * - :code:`FalconForCausalLM`
    - Falcon
-    - :code:`tiiuae/falcon-7b``, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
    - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
  * - :code:`GPT2LMHeadModel`
    - GPT-2
    - :code:`gpt2`, :code:`gpt2-xl`, etc.
@@ -53,9 +56,15 @@ Alongside each architecture, we include some popular models that use it.
  * - :code:`OPTForCausalLM`
    - OPT, OPT-IML
    - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
  * - :code:`PhiForCausalLM`
    - Phi-1.5
    - :code:`microsoft/phi-1_5`, etc.
  * - :code:`QWenLMHeadModel`
    - Qwen
    - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
  * - :code:`YiForCausalLM`
    - Yi
    - :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
@@ -72,4 +81,18 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
    output = llm.generate("Hello, my name is")
    print(output)

To use model from www.modelscope.cn

.. code-block:: shell

    $ export VLLM_USE_MODELSCOPE=True

.. code-block:: python

    from vllm import LLM

    llm = LLM(model=..., revision=..., trust_remote_code=True)  # Name or path of your model
    output = llm.generate("Hello, my name is")
    print(output)

If vLLM successfully generates text, it indicates that your model is supported.

@@ -0,0 +1,75 @@
.. _auto_awq:
AutoAWQ
==================
.. warning::
Please note that AWQ support in vLLM is currently under-optimized. We recommend using the unquantized version of the model for better
accuracy and higher throughput. For now, you can use AWQ as a way to reduce the memory footprint; it is more suitable for low-latency
inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version.
To create a new 4-bit quantized model, you can leverage `AutoAWQ <https://github.com/casper-hansen/AutoAWQ>`_.
Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
The main benefits are lower latency and memory usage.
You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface <https://huggingface.co/models?sort=trending&search=awq>`_.
.. code-block:: console
$ pip install autoawq
After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize Vicuna 7B v1.5:
.. code-block:: python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'lmsys/vicuna-7b-v1.5'
quant_path = 'vicuna-7b-v1.5-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
# Load model
model = AutoAWQForCausalLM.from_pretrained(model_path, **{"low_cpu_mem_usage": True})
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize
model.quantize(tokenizer, quant_config=quant_config)
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ <https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ>`_ with the following command:
.. code-block:: console
$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
AWQ models are also supported directly through the LLM entrypoint:
.. code-block:: python
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

@@ -0,0 +1,43 @@
.. _deploying_with_docker:
Deploying with Docker
============================
vLLM offers an official Docker image for deployment.
The image can be used to run an OpenAI-compatible server.
The image is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.
.. code-block:: console
$ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model mistralai/Mistral-7B-v0.1
.. note::
You can either use the ``--ipc=host`` flag or the ``--shm-size`` flag to allow the
container to access the host's shared memory. vLLM uses PyTorch, which uses shared
memory to share data between processes under the hood, particularly for tensor parallel inference.
You can build and run vLLM from source via the provided dockerfile. To build vLLM:
.. code-block:: console
$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --build-arg max_jobs=8
To run vLLM:
.. code-block:: console
$ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
vllm/vllm-openai <args...>

@@ -0,0 +1,13 @@
Production Metrics
==================
vLLM exposes a number of metrics that can be used to monitor the health of the
system. These metrics are exposed via the `/metrics` endpoint on the vLLM
OpenAI compatible API server.
The following metrics are exposed:
.. literalinclude:: ../../../vllm/engine/metrics.py
:language: python
:start-after: begin-metrics-definitions
:end-before: end-metrics-definitions
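For a quick look at the exposed values, you can scrape the endpoint directly. This sketch assumes a server running at ``http://localhost:8000`` and that vLLM's metric names start with ``vllm``:

.. code-block:: python

    import urllib.request

    with urllib.request.urlopen("http://localhost:8000/metrics") as resp:
        for line in resp.read().decode().splitlines():
            if line.startswith("vllm"):  # keep only vLLM's own metric samples
                print(line)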

@@ -0,0 +1,31 @@
.. _run_on_langchain:
Serving with Langchain
============================
vLLM is also available via `Langchain <https://github.com/langchain-ai/langchain>`_ .
To install langchain, run
.. code-block:: console
$ pip install langchain -q
To run inference on a single GPU or multiple GPUs, use the ``VLLM`` class from ``langchain``.
.. code-block:: python
from langchain.llms import VLLM
llm = VLLM(model="mosaicml/mpt-7b",
trust_remote_code=True, # mandatory for hf models
max_new_tokens=128,
top_k=10,
top_p=0.95,
temperature=0.8,
# tensor_parallel_size=... # for distributed inference
)
print(llm("What is the capital of France ?"))
Please refer to this `Tutorial <https://github.com/langchain-ai/langchain/blob/master/docs/extras/integrations/llms/vllm.ipynb>`_ for more details.

@@ -39,7 +39,7 @@ def build_demo():
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    parser.add_argument("--model-url",
                        type=str,

@@ -1,17 +1,14 @@
import argparse
from typing import List, Tuple

-from vllm import EngineArgs, LLMEngine, SamplingParams
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput


-def main(args: argparse.Namespace):
-    # Parse the CLI argument and initialize the engine.
-    engine_args = EngineArgs.from_cli_args(args)
-    engine = LLMEngine.from_engine_args(engine_args)
-
-    # Test the following prompts.
-    test_prompts = [
def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters."""
    return [
        ("A robot may not injure a human being",
-         SamplingParams(temperature=0.0)),
         SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
        ("To be or not to be,",
         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
        ("What is the meaning of life?",
@@ -25,22 +22,36 @@ def main(args: argparse.Namespace):
                        temperature=0.0)),
    ]

-    # Run the engine by calling `engine.step()` manually.

def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

-    while True:
-        # To test continuous batching, we add one request at each step.
    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1

-        request_outputs = engine.step()
        request_outputs: List[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:
                print(request_output)

-        if not (engine.has_unfinished_requests() or test_prompts):
-            break

def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Initialize the LLMEngine from the command line arguments."""
    engine_args = EngineArgs.from_cli_args(args)
    return LLMEngine.from_engine_args(engine_args)


def main(args: argparse.Namespace):
    """Main function that sets up and runs the prompt processing."""
    engine = initialize_engine(args)
    test_prompts = create_test_prompts()
    process_requests(engine, test_prompts)


if __name__ == '__main__':

@@ -1,18 +1,19 @@
-import openai
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
-openai.api_key = "EMPTY"
-openai.api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

-# List models API
-models = openai.Model.list()
-print("Models:", models)
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

-model = models["data"][0]["id"]
models = client.models.list()
model = models.data[0].id

-# Chat completion API
-chat_completion = openai.ChatCompletion.create(
-    model=model,
chat_completion = client.chat.completions.create(
    messages=[{
        "role": "system",
        "content": "You are a helpful assistant."
@@ -27,7 +28,10 @@ chat_completion = openai.ChatCompletion.create(
    }, {
        "role": "user",
        "content": "Where was it played?"
-    }])
    }],
    model=model,
)

print("Chat completion results:")
print(chat_completion)

@@ -1,24 +1,28 @@
-import openai
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
-openai.api_key = "EMPTY"
-openai.api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

-# List models API
-models = openai.Model.list()
-print("Models:", models)
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

-model = models["data"][0]["id"]
models = client.models.list()
model = models.data[0].id

# Completion API
stream = False
-completion = openai.Completion.create(
completion = client.completions.create(
    model=model,
    prompt="A robot may not injure a human being",
    echo=False,
    n=2,
    stream=stream,
-    logprobs=3)
    logprobs=3
)

print("Completion results:")
if stream:

@@ -0,0 +1,29 @@
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
{% for message in messages %}
{% if message['role'] == 'user' %}
### Instruction:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
### Response:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'user_context' %}
### Input:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
### Response:
{% endif %}

@@ -0,0 +1,2 @@
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
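To preview how the ChatML template above renders a conversation, it can be evaluated with jinja2 directly; this standalone sketch only mirrors the rendering that the server performs through the tokenizer's chat-template machinery.

from jinja2 import Template

# The template text below is copied from the file above (raw strings so that
# '\n' is handled by Jinja, exactly as in the original file).
chatml = (
    r"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}"
    r"{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}"
    r"{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a joke."},
]
print(Template(chatml).render(messages=messages, add_generation_prompt=True))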

@@ -0,0 +1,30 @@
<#meta#>
- Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
- Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
<#system#>
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
<#chat#>
{% for message in messages %}
{% if message['role'] == 'user' %}
<#user#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
<#bot#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'user_context' %}
<#user_context#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
<#bot#>
{% endif %}

@@ -7,7 +7,7 @@
# # Format files that differ from origin/main.
# bash format.sh

-# # Commit changed files with message 'Run yapf and pylint'
# # Commit changed files with message 'Run yapf and ruff'
# #
# #
# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
@@ -22,7 +22,7 @@ ROOT="$(git rev-parse --show-toplevel)"
builtin cd "$ROOT" || exit 1

YAPF_VERSION=$(yapf --version | awk '{print $2}')
-PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}')
RUFF_VERSION=$(ruff --version | awk '{print $2}')
MYPY_VERSION=$(mypy --version | awk '{print $2}')

# # params: tool name, tool version, required version
@@ -34,7 +34,7 @@ tool_version_check() {
}

tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
-tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"

YAPF_FLAGS=(
@@ -44,7 +44,6 @@ YAPF_FLAGS=(

YAPF_EXCLUDES=(
    '--exclude' 'build/**'
-    '--exclude' 'vllm/model_executor/parallel_utils/**'
)

# Format specified files
@@ -72,7 +71,7 @@ format_changed() {

# Format all files
format_all() {
-    yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm
    yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm tests
}

## This flag formats individual files. --files *must* be the first command line
@@ -94,9 +93,43 @@ echo 'vLLM yapf: Done'
# echo 'vLLM mypy:' # echo 'vLLM mypy:'
# mypy # mypy
# Run Pylint # Lint specified files
echo 'vLLM Pylint:' lint() {
pylint vllm ruff "$@"
}
# Lint files that differ from main branch. Ignores dirs that are not slated
# for autolint yet.
lint_changed() {
# The `if` guard ensures that the list of filenames is not empty, which
# could cause ruff to receive 0 positional arguments, making it hang
# waiting for STDIN.
#
# `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
# exist on both branches.
MERGEBASE="$(git merge-base origin/main HEAD)"
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
ruff
fi
}
# Run Ruff
echo 'vLLM Ruff:'
## This flag lints individual files. --files *must* be the first command line
## arg to use this option.
if [[ "$1" == '--files' ]]; then
lint "${@:2}"
# If `--all` is passed, then any further arguments are ignored and the
# entire python directory is linted.
elif [[ "$1" == '--all' ]]; then
lint vllm tests
else
# Format only the files that changed in last commit.
lint_changed
fi
if ! git diff --quiet &>/dev/null; then if ! git diff --quiet &>/dev/null; then
echo 'Reformatted files. Please review and stage the changes.' echo 'Reformatted files. Please review and stage the changes.'

View File

@@ -1,9 +1,34 @@
[build-system] [build-system]
# Should be mirrored in requirements-build.txt
requires = [ requires = [
"ninja", "ninja",
"packaging", "packaging",
"setuptools", "setuptools >= 49.4.0",
"torch >= 2.0.0", "torch >= 2.1.0",
"wheel", "wheel",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
# "UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
# "I",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# line too long, handled by black formatting
"E501",
]

requirements-build.txt (new file)
View File

@@ -0,0 +1,6 @@
# Should be mirrored in pyproject.toml
ninja
packaging
setuptools>=49.4.0
torch>=2.1.0
wheel

View File

@@ -1,6 +1,6 @@
# formatting # formatting
yapf==0.32.0 yapf==0.32.0
pylint==2.8.2 ruff==0.1.5
# type checking # type checking
mypy==0.991 mypy==0.991
@@ -12,3 +12,4 @@ types-setuptools
pytest pytest
pytest-forked pytest-forked
pytest-asyncio pytest-asyncio

View File

@@ -5,9 +5,11 @@ pandas # Required for Ray data.
pyarrow # Required for Ray data. pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy numpy
torch >= 2.0.0 einops # Required for phi-1_5
transformers >= 4.33.1 # Required for Code Llama. torch >= 2.1.0
xformers >= 0.0.22 transformers >= 4.34.0 # Required for Mistral.
xformers >= 0.0.22.post7 # Required for CUDA 12.1.
fastapi fastapi
uvicorn[standard] uvicorn[standard]
pydantic < 2 # Required for OpenAI server. pydantic == 1.10.13 # Required for OpenAI server.
aioprometheus[starlette]

setup.py
View File

@@ -12,8 +12,10 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
ROOT_DIR = os.path.dirname(__file__) ROOT_DIR = os.path.dirname(__file__)
MAIN_CUDA_VERSION = "12.1"
# Supported NVIDIA GPU architectures. # Supported NVIDIA GPU architectures.
SUPPORTED_ARCHS = ["7.0", "7.5", "8.0", "8.6", "8.9", "9.0"] SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
# Compiler flags. # Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"] CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
@@ -49,19 +51,33 @@ def get_torch_arch_list() -> Set[str]:
# and executed on the 8.6 or newer architectures. While the PTX code will # and executed on the 8.6 or newer architectures. While the PTX code will
# not give the best performance on the newer architectures, it provides # not give the best performance on the newer architectures, it provides
# forward compatibility. # forward compatibility.
valid_arch_strs = SUPPORTED_ARCHS + [s + "+PTX" for s in SUPPORTED_ARCHS] env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) if env_arch_list is None:
if arch_list is None:
return set() return set()
# List are separated by ; or space. # List are separated by ; or space.
arch_list = arch_list.replace(" ", ";").split(";") torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
for arch in arch_list: if not torch_arch_list:
if arch not in valid_arch_strs: return set()
raise ValueError(
f"Unsupported CUDA arch ({arch}). " # Filter out the invalid architectures and print a warning.
f"Valid CUDA arch strings are: {valid_arch_strs}.") valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in SUPPORTED_ARCHS})
return set(arch_list) arch_list = torch_arch_list.intersection(valid_archs)
# If none of the specified architectures are valid, raise an error.
if not arch_list:
raise RuntimeError(
"None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
f"variable ({env_arch_list}) is supported. "
f"Supported CUDA architectures are: {valid_archs}.")
invalid_arch_list = torch_arch_list - valid_archs
if invalid_arch_list:
warnings.warn(
f"Unsupported CUDA architectures ({invalid_arch_list}) are "
"excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
f"({env_arch_list}). Supported CUDA architectures are: "
f"{valid_archs}.",
stacklevel=2)
return arch_list
# First, check the TORCH_CUDA_ARCH_LIST environment variable. # First, check the TORCH_CUDA_ARCH_LIST environment variable.
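Condensed, runnable restatement of the new filtering behavior above (names mirror the diff): unsupported entries in TORCH_CUDA_ARCH_LIST are now dropped with a warning, and an error is raised only when nothing supported remains.

import warnings

SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
VALID_ARCHS = SUPPORTED_ARCHS | {s + "+PTX" for s in SUPPORTED_ARCHS}

def filter_arch_list(env_arch_list: str) -> set:
    requested = set(env_arch_list.replace(" ", ";").split(";"))
    arch_list = requested & VALID_ARCHS
    if not arch_list:
        raise RuntimeError(f"No supported CUDA arch in {env_arch_list!r}")
    invalid = requested - VALID_ARCHS
    if invalid:
        warnings.warn(f"Ignoring unsupported CUDA archs: {invalid}", stacklevel=2)
    return arch_list

print(filter_arch_list("8.0;8.6+PTX;5.2"))  # keeps {'8.0', '8.6+PTX'}, warns about 5.2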
@@ -81,7 +97,7 @@ nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if not compute_capabilities: if not compute_capabilities:
# If no GPU is specified nor available, add all supported architectures # If no GPU is specified nor available, add all supported architectures
# based on the NVCC CUDA version. # based on the NVCC CUDA version.
compute_capabilities = set(SUPPORTED_ARCHS) compute_capabilities = SUPPORTED_ARCHS.copy()
if nvcc_cuda_version < Version("11.1"): if nvcc_cuda_version < Version("11.1"):
compute_capabilities.remove("8.6") compute_capabilities.remove("8.6")
if nvcc_cuda_version < Version("11.8"): if nvcc_cuda_version < Version("11.8"):
@@ -91,10 +107,10 @@ if not compute_capabilities:
# Validate the NVCC CUDA version. # Validate the NVCC CUDA version.
if nvcc_cuda_version < Version("11.0"): if nvcc_cuda_version < Version("11.0"):
raise RuntimeError("CUDA 11.0 or higher is required to build the package.") raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
if nvcc_cuda_version < Version("11.1"): if (nvcc_cuda_version < Version("11.1")
if any(cc.startswith("8.6") for cc in compute_capabilities): and any(cc.startswith("8.6") for cc in compute_capabilities)):
raise RuntimeError( raise RuntimeError(
"CUDA 11.1 or higher is required for compute capability 8.6.") "CUDA 11.1 or higher is required for compute capability 8.6.")
if nvcc_cuda_version < Version("11.8"): if nvcc_cuda_version < Version("11.8"):
if any(cc.startswith("8.9") for cc in compute_capabilities): if any(cc.startswith("8.9") for cc in compute_capabilities):
# CUDA 11.8 is required to generate the code targeting compute capability 8.9. # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
@@ -104,7 +120,8 @@ if nvcc_cuda_version < Version("11.8"):
# instead of 8.9. # instead of 8.9.
warnings.warn( warnings.warn(
"CUDA 11.8 or higher is required for compute capability 8.9. " "CUDA 11.8 or higher is required for compute capability 8.9. "
"Targeting compute capability 8.0 instead.") "Targeting compute capability 8.0 instead.",
stacklevel=2)
compute_capabilities = set(cc for cc in compute_capabilities compute_capabilities = set(cc for cc in compute_capabilities
if not cc.startswith("8.9")) if not cc.startswith("8.9"))
compute_capabilities.add("8.0+PTX") compute_capabilities.add("8.0+PTX")
@@ -125,93 +142,32 @@ if nvcc_cuda_version >= Version("11.2"):
NVCC_FLAGS += ["--threads", str(num_threads)] NVCC_FLAGS += ["--threads", str(num_threads)]
ext_modules = [] ext_modules = []
vllm_extension = CUDAExtension(
# Cache operations. name="vllm._C",
cache_extension = CUDAExtension(
name="vllm.cache_ops",
sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(cache_extension)
# Attention kernels.
attention_extension = CUDAExtension(
name="vllm.attention_ops",
sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(attention_extension)
# Positional encoding kernels.
positional_encoding_extension = CUDAExtension(
name="vllm.pos_encoding_ops",
sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(positional_encoding_extension)
# Layer normalization kernels.
layernorm_extension = CUDAExtension(
name="vllm.layernorm_ops",
sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(layernorm_extension)
# Activation kernels.
activation_extension = CUDAExtension(
name="vllm.activation_ops",
sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(activation_extension)
# Quantization kernels.
quantization_extension = CUDAExtension(
name="vllm.quantization_ops",
sources=[ sources=[
"csrc/quantization.cpp", "csrc/cache_kernels.cu",
"csrc/attention/attention_kernels.cu",
"csrc/pos_encoding_kernels.cu",
"csrc/activation_kernels.cu",
"csrc/layernorm_kernels.cu",
"csrc/quantization/awq/gemm_kernels.cu", "csrc/quantization/awq/gemm_kernels.cu",
"csrc/quantization/squeezellm/quant_cuda_kernel.cu",
"csrc/cuda_utils_kernels.cu",
"csrc/pybind.cpp",
], ],
extra_compile_args={ extra_compile_args={
"cxx": CXX_FLAGS, "cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS, "nvcc": NVCC_FLAGS,
}, },
) )
ext_modules.append(quantization_extension) ext_modules.append(vllm_extension)
# Misc. CUDA utils.
cuda_utils_extension = CUDAExtension(
name="vllm.cuda_utils",
sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(cuda_utils_extension)
def get_path(*filepath) -> str: def get_path(*filepath) -> str:
return os.path.join(ROOT_DIR, *filepath) return os.path.join(ROOT_DIR, *filepath)
def find_version(filepath: str): def find_version(filepath: str) -> str:
"""Extract version information from the given filepath. """Extract version information from the given filepath.
Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
@@ -224,9 +180,22 @@ def find_version(filepath: str):
raise RuntimeError("Unable to find version string.") raise RuntimeError("Unable to find version string.")
def get_vllm_version() -> str:
version = find_version(get_path("vllm", "__init__.py"))
cuda_version = str(nvcc_cuda_version)
if cuda_version != MAIN_CUDA_VERSION:
cuda_version_str = cuda_version.replace(".", "")[:3]
version += f"+cu{cuda_version_str}"
return version
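The effect of get_vllm_version is that wheels built against a CUDA toolkit other than MAIN_CUDA_VERSION (12.1) get a local version suffix. A small illustration of the same string logic:

MAIN_CUDA_VERSION = "12.1"

def with_cuda_suffix(version: str, cuda_version: str) -> str:
    # Mirrors the suffix logic above: e.g. "11.8" -> "+cu118".
    if cuda_version != MAIN_CUDA_VERSION:
        version += "+cu" + cuda_version.replace(".", "")[:3]
    return version

print(with_cuda_suffix("0.2.3", "11.8"))  # 0.2.3+cu118
print(with_cuda_suffix("0.2.3", "12.1"))  # 0.2.3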
def read_readme() -> str: def read_readme() -> str:
"""Read the README file.""" """Read the README file if present."""
return io.open(get_path("README.md"), "r", encoding="utf-8").read() p = get_path("README.md")
if os.path.isfile(p):
return io.open(get_path("README.md"), "r", encoding="utf-8").read()
else:
return ""
def get_requirements() -> List[str]: def get_requirements() -> List[str]:
@@ -238,7 +207,7 @@ def get_requirements() -> List[str]:
setuptools.setup( setuptools.setup(
name="vllm", name="vllm",
version=find_version(get_path("vllm", "__init__.py")), version=get_vllm_version(),
author="vLLM Team", author="vLLM Team",
license="Apache 2.0", license="Apache 2.0",
description=("A high-throughput and memory-efficient inference and " description=("A high-throughput and memory-efficient inference and "
@@ -264,4 +233,5 @@ setuptools.setup(
install_requires=get_requirements(), install_requires=get_requirements(),
ext_modules=ext_modules, ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension}, cmdclass={"build_ext": BuildExtension},
package_data={"vllm": ["py.typed"]},
) )

tests/__init__.py (new, empty file)
View File

View File

@@ -48,9 +48,9 @@ def test_api_server(api_server):
result = None result = None
while not result: while not result:
try: try:
for result in pool.map(_query_server, prompts): for _ in pool.map(_query_server, prompts):
break break
except: except Exception:
time.sleep(1) time.sleep(1)
# Actual tests start here # Actual tests start here

View File

@@ -32,12 +32,12 @@ class MockEngine:
self.request_id = None self.request_id = None
def add_request(self, **kwargs): def add_request(self, **kwargs):
del kwargs # Unused
self.add_request_calls += 1 self.add_request_calls += 1
return
def abort_request(self, request_id): def abort_request(self, request_id):
del request_id # Unused
self.abort_request_calls += 1 self.abort_request_calls += 1
return
class MockAsyncLLMEngine(AsyncLLMEngine): class MockAsyncLLMEngine(AsyncLLMEngine):

View File

@@ -0,0 +1,119 @@
from argparse import Namespace
from dataclasses import dataclass
import pytest
from fastapi.testclient import TestClient
from vllm.entrypoints.openai.api_server import *
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", None, True,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", None, False,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of""")
]
TEST_MESSAGES = [
{
'role': 'user',
'content': 'Hello'
},
{
'role': 'assistant',
'content': 'Hi there!'
},
{
'role': 'user',
'content': 'What is the capital of'
},
]
client = TestClient(app)
@dataclass
class MockTokenizer:
chat_template = None
def test_load_chat_template():
# Testing chatml template
template = "../../examples/template_chatml.jinja"
mock_args = Namespace(chat_template=template)
tokenizer = MockTokenizer()
# Call the function with the mocked args
load_chat_template(mock_args, tokenizer)
template_content = tokenizer.chat_template
# Test assertions
assert template_content is not None
# Hard coded value for template_chatml.jinja
assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
def test_no_load_chat_template():
# Testing chatml template
template = "../../examples/does_not_exist"
mock_args = Namespace(chat_template=template)
tokenizer = MockTokenizer()
# Call the function with the mocked args
load_chat_template(mock_args, tokenizer=tokenizer)
template_content = tokenizer.chat_template
# Test assertions
assert template_content is not None
# Hard coded value for template_chatml.jinja
assert template_content == """../../examples/does_not_exist"""
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model,template,add_generation_prompt,expected_output",
MODEL_TEMPLATE_GENERATON_OUTPUT)
async def test_get_gen_prompt(model, template, add_generation_prompt,
expected_output):
# Initialize the tokenizer
tokenizer = get_tokenizer(tokenizer_name=model)
mock_args = Namespace(chat_template=template)
load_chat_template(mock_args, tokenizer)
# Create a mock request object using keyword arguments
mock_request = ChatCompletionRequest(
model=model,
messages=TEST_MESSAGES,
add_generation_prompt=add_generation_prompt)
# Call the function and get the result
result = tokenizer.apply_chat_template(
conversation=mock_request.messages,
tokenize=False,
add_generation_prompt=mock_request.add_generation_prompt)
# Test assertion
assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}"
def test_health_endpoint():
response = client.get("/health")
assert response.status_code == 200

View File

@@ -7,22 +7,22 @@ from vllm.outputs import RequestOutput
class DummyEvent: class DummyEvent:
def __init__(self): def __init__(self):
self._flag = False self.flag = False
def set(self): def set(self):
self._flag = True self.flag = True
def clear(self): def clear(self):
self._flag = False self.flag = False
def test_request_tracker(): def test_request_tracker():
tracker = RequestTracker() tracker = RequestTracker()
tracker.new_requests_event = DummyEvent() tracker.new_requests_event = DummyEvent()
stream_1 = tracker.add_request("1") stream_1 = tracker.add_request("1")
assert tracker.new_requests_event._flag assert tracker.new_requests_event.flag
new, finished = tracker.get_new_and_finished_requests() new, finished = tracker.get_new_and_finished_requests()
assert not tracker.new_requests_event._flag assert not tracker.new_requests_event.flag
assert len(new) == 1 assert len(new) == 1
assert new[0]["request_id"] == "1" assert new[0]["request_id"] == "1"
assert not finished assert not finished
@@ -30,9 +30,9 @@ def test_request_tracker():
stream_2 = tracker.add_request("2") stream_2 = tracker.add_request("2")
stream_3 = tracker.add_request("3") stream_3 = tracker.add_request("3")
assert tracker.new_requests_event._flag assert tracker.new_requests_event.flag
new, finished = tracker.get_new_and_finished_requests() new, finished = tracker.get_new_and_finished_requests()
assert not tracker.new_requests_event._flag assert not tracker.new_requests_event.flag
assert len(new) == 2 assert len(new) == 2
assert new[0]["request_id"] == "2" assert new[0]["request_id"] == "2"
assert new[1]["request_id"] == "3" assert new[1]["request_id"] == "3"
@@ -43,7 +43,7 @@ def test_request_tracker():
# request_ids must be unique # request_ids must be unique
with pytest.raises(KeyError): with pytest.raises(KeyError):
tracker.add_request("1") tracker.add_request("1")
assert not tracker.new_requests_event._flag assert not tracker.new_requests_event.flag
tracker.abort_request("1") tracker.abort_request("1")
new, finished = tracker.get_new_and_finished_requests() new, finished = tracker.get_new_and_finished_requests()
@@ -54,7 +54,7 @@ def test_request_tracker():
stream_4 = tracker.add_request("4") stream_4 = tracker.add_request("4")
tracker.abort_request("4") tracker.abort_request("4")
assert tracker.new_requests_event._flag assert tracker.new_requests_event.flag
new, finished = tracker.get_new_and_finished_requests() new, finished = tracker.get_new_and_finished_requests()
assert len(finished) == 1 assert len(finished) == 1
assert "4" in finished assert "4" in finished
@@ -62,11 +62,11 @@ def test_request_tracker():
assert stream_4.finished assert stream_4.finished
stream_5 = tracker.add_request("5") stream_5 = tracker.add_request("5")
assert tracker.new_requests_event._flag assert tracker.new_requests_event.flag
tracker.process_request_output( tracker.process_request_output(
RequestOutput("2", "output", [], [], finished=True)) RequestOutput("2", "output", [], [], [], finished=True))
new, finished = tracker.get_new_and_finished_requests() new, finished = tracker.get_new_and_finished_requests()
assert not tracker.new_requests_event._flag assert not tracker.new_requests_event.flag
assert len(finished) == 1 assert len(finished) == 1
assert "2" in finished assert "2" in finished
assert len(new) == 1 assert len(new) == 1

View File

@@ -106,6 +106,39 @@ class HfRunner:
outputs[i] = (output_ids, output_str) outputs[i] = (output_ids, output_str)
return outputs return outputs
def generate_greedy_logprobs(
self,
prompts: List[str],
max_tokens: int,
) -> List[List[torch.Tensor]]:
all_logprobs = []
for prompt in prompts:
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
output = self.model.generate(
input_ids.cuda(),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
)
seq_logprobs = []
for hidden_states in output.hidden_states:
last_hidden_states = hidden_states[-1][0]
logits = torch.matmul(
last_hidden_states,
self.model.get_output_embeddings().weight.t(),
)
if self.model.get_output_embeddings().bias is not None:
logits += self.model.get_output_embeddings(
).bias.unsqueeze(0)
logprobs = torch.nn.functional.log_softmax(logits,
dim=-1,
dtype=torch.float32)
seq_logprobs.append(logprobs)
all_logprobs.append(seq_logprobs)
return all_logprobs
@pytest.fixture @pytest.fixture
def hf_runner(): def hf_runner():

View File

@@ -0,0 +1,83 @@
"""Test the communication operators.
Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
from multiprocessing import Process, set_start_method
import pytest
import torch
from vllm.config import ParallelConfig
from vllm.engine.ray_utils import get_open_port
from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_reduce,
tensor_model_parallel_all_gather,
)
from vllm.worker.worker import _init_distributed_environment
def init_test_distributed_environment(pipeline_parallel_size: int,
tensor_parallel_size: int, rank: int,
distributed_init_port: str):
parallel_config = ParallelConfig(pipeline_parallel_size,
tensor_parallel_size,
worker_use_ray=True)
distributed_init_method = f"tcp://localhost:{distributed_init_port}"
torch.cuda.set_device(rank)
_init_distributed_environment(parallel_config, rank,
distributed_init_method)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port)
num_elements = 8
all_tensors = [
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
(r + 1) for r in range(tensor_parallel_size)
]
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
t = all_tensors[rank]
t = tensor_model_parallel_all_reduce(t)
assert torch.allclose(t, expected)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port)
num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2))
total_size = 1
for s in tensor_size:
total_size *= s
for all_gather_dimension in range(num_dimensions):
all_tensors = [
torch.arange(total_size, dtype=torch.float32,
device="cuda").reshape(tensor_size) * (r + 1)
for r in range(tensor_parallel_size)
]
expected = torch.cat(all_tensors, dim=all_gather_dimension)
t = all_tensors[rank]
t = tensor_model_parallel_all_gather(t, all_gather_dimension)
assert torch.allclose(t, expected)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("test_target",
[all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
set_start_method("spawn", force=True)
distributed_init_port = get_open_port()
processes = []
for rank in range(tensor_parallel_size):
p = Process(target=test_target,
args=(tensor_parallel_size, rank, distributed_init_port))
p.start()
processes.append(p)
for p in processes:
p.join()
assert all(p.exitcode == 0 for p in processes)

View File

@@ -5,9 +5,9 @@ from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import detokenize_incrementally from vllm.transformers_utils.tokenizer import detokenize_incrementally
TRUTH = [ TRUTH = [
"Hello here, this is a simple test", "Hello here, this is a simple test", # noqa: E501
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", # noqa: E501
"我很感谢你的热情" "我很感谢你的热情" # noqa: E501
] ]
TOKENIZERS = [ TOKENIZERS = [
"facebook/opt-125m", "facebook/opt-125m",

View File

@@ -1,9 +1,7 @@
import pytest import pytest
import torch import torch
import torch.nn.functional as F
from transformers.activations import get_activation
from vllm import activation_ops from vllm.model_executor.layers.activation import FastGELU, NewGELU, SiluAndMul
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing
@@ -11,11 +9,6 @@ D = [512, 4096, 5120, 13824] # Arbitrary values for testing
SEEDS = [0] SEEDS = [0]
def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
x1, x2 = x.chunk(chunks=2, dim=1)
return F.silu(x1) * x2
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D) @pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@@ -29,10 +22,10 @@ def test_silu_and_mul(
) -> None: ) -> None:
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) torch.cuda.manual_seed(seed)
x = torch.randn(num_tokens, 2 * d, dtype=dtype, device='cuda') x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda")
out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') layer = SiluAndMul()
activation_ops.silu_and_mul(out, x) out = layer(x)
ref_out = ref_silu_and_mul(x) ref_out = layer._forward(x)
assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
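The local ref_silu_and_mul helper is removed because the layer's own _forward now serves as the reference. For readers skimming the diff, the underlying math (restated from the deleted helper) is:

import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half (dim=1 for the 2-D test inputs) and
    # gate one half with SiLU of the other, as the deleted helper did.
    x1, x2 = x.chunk(2, dim=-1)
    return F.silu(x1) * x2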
@@ -49,10 +42,10 @@ def test_gelu_new(
) -> None: ) -> None:
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) torch.cuda.manual_seed(seed)
x = torch.randn(num_tokens, d, dtype=dtype, device='cuda') x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') layer = NewGELU()
activation_ops.gelu_new(out, x) out = layer(x)
ref_out = get_activation("gelu_new")(x) ref_out = layer._forward(x)
assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
@@ -68,8 +61,8 @@ def test_gelu_fast(
) -> None: ) -> None:
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) torch.cuda.manual_seed(seed)
x = torch.randn(num_tokens, d, dtype=dtype, device='cuda') x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') layer = FastGELU()
activation_ops.gelu_fast(out, x) out = layer(x)
ref_out = get_activation("gelu_fast")(x) ref_out = layer._forward(x)
assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)

View File

@@ -6,21 +6,22 @@ import torch
from xformers import ops as xops from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from vllm import attention_ops from vllm._C import ops
from vllm.utils import get_max_shared_memory_bytes from vllm.utils import get_max_shared_memory_bytes
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability. # This will change depending on the compute capability.
# - 512 as a buffer # - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
NUM_BLOCKS = 128 # Arbitrary values for testing NUM_BLOCKS = 40000 # Arbitrary values for testing
PARTITION_SIZE = 512
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_GEN_SEQS = [7] # Arbitrary values for testing
NUM_PREFILL_SEQS = [1, 3, 7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing
NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256] HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32] BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True] USE_ALIBI = [False, True]
SEEDS = [0] SEEDS = [0]
@@ -96,6 +97,7 @@ def ref_single_query_cached_kv_attention(
output[i].copy_(out, non_blocking=True) output[i].copy_(out, non_blocking=True)
@pytest.mark.parametrize("version", ["v1", "v2"])
@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -103,9 +105,9 @@ def ref_single_query_cached_kv_attention(
@pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode() def test_paged_attention(
def test_single_query_cached_kv_attention(
kv_cache_factory, kv_cache_factory,
version: str,
num_seqs: int, num_seqs: int,
num_heads: Tuple[int, int], num_heads: Tuple[int, int],
head_size: int, head_size: int,
@@ -162,19 +164,54 @@ def test_single_query_cached_kv_attention(
# Call the paged attention kernel. # Call the paged attention kernel.
output = torch.empty_like(query) output = torch.empty_like(query)
attention_ops.single_query_cached_kv_attention( if version == "v1":
output, ops.paged_attention_v1(
query, output,
key_cache, query,
value_cache, key_cache,
head_mapping, value_cache,
scale, head_mapping,
block_tables, scale,
context_lens, block_tables,
block_size, context_lens,
max_context_len, block_size,
alibi_slopes, max_context_len,
) alibi_slopes,
)
elif version == "v2":
num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
PARTITION_SIZE)
assert PARTITION_SIZE % block_size == 0
num_seqs, num_heads, head_size = output.shape
tmp_output = torch.empty(
size=(num_seqs, num_heads, num_partitions, head_size),
dtype=output.dtype,
device=output.device,
)
exp_sums = torch.empty(
size=(num_seqs, num_heads, num_partitions),
dtype=torch.float32,
device=output.device,
)
max_logits = torch.empty_like(exp_sums)
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
head_mapping,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
)
else:
raise AssertionError(f"Unknown version: {version}")
# Run the reference implementation. # Run the reference implementation.
ref_output = torch.empty_like(query) ref_output = torch.empty_like(query)
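For intuition on the v2 path (a worked example, not from the diff): with PARTITION_SIZE = 512, a context of 1030 tokens is split into ceil(1030 / 512) = 3 partitions, which sizes the tmp_output, exp_sums, and max_logits buffers.

PARTITION_SIZE = 512
max_context_len = 1030
num_partitions = (max_context_len + PARTITION_SIZE - 1) // PARTITION_SIZE
print(num_partitions)  # 3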

View File

@@ -3,16 +3,16 @@ import random
import pytest import pytest
import torch import torch
from vllm import cache_ops from vllm._C import cache_ops
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing NUM_TOKENS = [83] # Arbitrary values for testing
NUM_LAYERS = [5] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing
NUM_HEADS = [8] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256] HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32] BLOCK_SIZES = [8, 16, 32]
NUM_BLOCKS = [1024] # Arbitrary values for testing NUM_BLOCKS = [1024, 36000] # Arbitrary values for testing
NUM_MAPPINGS = [32, 256] # Arbitrary values for testing NUM_MAPPINGS = [256] # Arbitrary values for testing
SEEDS = [0] SEEDS = [0]
@@ -69,9 +69,9 @@ def test_copy_blocks(
for src, dsts in block_mapping.items(): for src, dsts in block_mapping.items():
for dst in dsts: for dst in dsts:
for cloned_key_cache in cloned_key_caches: for cloned_key_cache in cloned_key_caches:
cloned_key_cache[dst] = cloned_key_cache[src] cloned_key_cache[dst].copy_(cloned_key_cache[src])
for cloned_value_cache in cloned_value_caches: for cloned_value_cache in cloned_value_caches:
cloned_value_cache[dst] = cloned_value_cache[src] cloned_value_cache[dst].copy_(cloned_value_cache[src])
# Compare the results. # Compare the results.
for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
@@ -106,14 +106,14 @@ def test_reshape_and_cache(
# Create a random slot mapping. # Create a random slot mapping.
num_slots = block_size * num_blocks num_slots = block_size * num_blocks
slot_mapping = random.sample(range(num_slots), num_tokens) slot_mapping = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device='cuda') slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device="cuda")
qkv = torch.randn(num_tokens, qkv = torch.randn(num_tokens,
3, 3,
num_heads, num_heads,
head_size, head_size,
dtype=dtype, dtype=dtype,
device='cuda') device="cuda")
_, key, value = qkv.unbind(dim=1) _, key, value = qkv.unbind(dim=1)
# Create the KV caches. # Create the KV caches.
@@ -132,7 +132,7 @@ def test_reshape_and_cache(
# Run the reference implementation. # Run the reference implementation.
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_indicies = block_indicies.cpu().tolist() block_indicies = block_indicies.cpu().tolist()
block_offsets = slot_mapping % block_size block_offsets = slot_mapping % block_size
block_offsets = block_offsets.cpu().tolist() block_offsets = block_offsets.cpu().tolist()

View File

@@ -1,58 +1,47 @@
import pytest import pytest
import torch import torch
import torch.nn as nn
from vllm import layernorm_ops from vllm.model_executor.layers.layernorm import RMSNorm
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [67, 768, 2048, 5120, 8192] # Arbitrary values for testing
NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing
ADD_RESIDUAL = [False, True]
SEEDS = [0] SEEDS = [0]
class RefRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
super().__init__()
weight = torch.empty(hidden_size)
weight.normal_(mean=1.0, std=0.1)
self.weight = nn.Parameter(weight)
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance +
self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode() @torch.inference_mode()
def test_rms_norm( def test_rms_norm(
num_tokens: int, num_tokens: int,
hidden_size: int, hidden_size: int,
add_residual: bool,
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
) -> None: ) -> None:
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) torch.cuda.manual_seed(seed)
scale = float(hidden_size**-0.5) layer = RMSNorm(hidden_size).to(dtype).cuda()
x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda") layer.weight.data.normal_(mean=1.0, std=0.1)
x.uniform_(-scale, scale) scale = 1 / (2 * hidden_size)
ref = RefRMSNorm(hidden_size).to(dtype).cuda() x = torch.randn(num_tokens, hidden_size, dtype=dtype, device="cuda")
x *= scale
residual = torch.randn_like(x) * scale if add_residual else None
out = torch.empty_like(x) # NOTE(woosuk): The reference implementation should be executed first
layernorm_ops.rms_norm( # because the custom kernel is in-place.
out, ref_out = layer._forward(x, residual)
x, out = layer(x, residual)
ref.weight.data, # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
ref.variance_epsilon, # numerical errors than other operators because they involve reductions.
) # Therefore, we use a larger tolerance.
ref_out = ref(x) if add_residual:
assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5) assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
else:
assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
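The deleted RefRMSNorm module is replaced by RMSNorm._forward as the reference. Its math, restated as a standalone function for context (same as the removed code):

import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor,
                       eps: float = 1e-6) -> torch.Tensor:
    # Normalize by the root mean square over the hidden dimension in float32,
    # then scale by the learned weight, as the deleted RefRMSNorm did.
    orig_dtype = x.dtype
    x = x.to(torch.float32)
    variance = x.pow(2).mean(-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps)
    return weight * x.to(orig_dtype)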

View File

@@ -1,105 +1,23 @@
from typing import Optional, Tuple from typing import Optional
import pytest import pytest
import torch import torch
import torch.nn as nn
import torch.nn.functional as F
from vllm import pos_encoding_ops from vllm.model_executor.layers.rotary_embedding import get_rope
IS_NEOX_STYLE = [True, False] IS_NEOX_STYLE = [True, False]
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
HEAD_SIZES = [64, 80, 96, 112, 128, 256] HEAD_SIZES = [64, 80, 96, 112, 128, 256]
ROTARY_DIMS = [None, 32] # None means rotary dim == head size ROTARY_DIMS = [None, 32] # None means rotary dim == head size
NUM_HEADS = [7, 12, 40, 52] # Arbitrary values for testing NUM_HEADS = [7, 17] # Arbitrary values for testing
NUM_TOKENS = [11, 83, 2048] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing
SEQ_LENS = [11, 8192] # Arbitrary values for testing
SEEDS = [0] SEEDS = [0]
def rotate_neox(x: torch.Tensor) -> torch.Tensor:
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
x1 = x[..., ::2]
x2 = x[..., 1::2]
x = torch.stack((-x2, x1), dim=-1)
return x.flatten(-2)
def apply_rope(
q: torch.Tensor,
k: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
is_neox_style: bool,
) -> Tuple[torch.Tensor, torch.Tensor]:
rotate_fn = rotate_neox if is_neox_style else rotate_gptj
q_embed = (q * cos) + (rotate_fn(q) * sin)
k_embed = (k * cos) + (rotate_fn(k) * sin)
return q_embed, k_embed
class RefRotaryEmbedding(nn.Module):
"""Reference implementation of rotary embedding."""
def __init__(
self,
dim: int,
is_neox_style: bool,
max_position_embeddings: int = 8192,
base: int = 10000,
) -> None:
super().__init__()
self.rotary_dim = dim
self.is_neox_style = is_neox_style
self.max_position_embeddings = max_position_embeddings
# Create cos and sin embeddings.
inv_freq = 1.0 / (base**(torch.arange(0, dim, 2) / dim))
t = torch.arange(max_position_embeddings).float()
freqs = torch.einsum("i,j->ij", t, inv_freq.float())
if is_neox_style:
emb = torch.cat((freqs, freqs), dim=-1)
else:
emb = torch.repeat_interleave(freqs, 2, -1)
cos = emb.cos().to(dtype=inv_freq.dtype)
sin = emb.sin().to(dtype=inv_freq.dtype)
self.register_buffer("cos_cached", cos, persistent=False)
self.register_buffer("sin_cached", sin, persistent=False)
def forward(
self,
positions: torch.Tensor, # [num_tokens]
query: torch.Tensor, # [num_tokens, num_heads, head_size]
key: torch.Tensor, # [num_tokens, num_heads, head_size]
) -> Tuple[torch.Tensor, torch.Tensor]:
query_rot = query[..., :self.rotary_dim]
query_pass = query[..., self.rotary_dim:]
key_rot = key[..., :self.rotary_dim]
key_pass = key[..., self.rotary_dim:]
query_rot = query_rot.transpose(0, 1)
key_rot = key_rot.transpose(0, 1)
cos = F.embedding(positions, self.cos_cached)
sin = F.embedding(positions, self.sin_cached)
query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin,
self.is_neox_style)
query_rot = query_rot.transpose(0, 1).contiguous()
key_rot = key_rot.transpose(0, 1).contiguous()
query = torch.cat((query_rot, query_pass), dim=-1)
key = torch.cat((key_rot, key_pass), dim=-1)
# Output query/key shape: [num_tokens, num_tokens, head_size]
return query, key
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@@ -108,7 +26,8 @@ class RefRotaryEmbedding(nn.Module):
@torch.inference_mode() @torch.inference_mode()
def test_rotary_embedding( def test_rotary_embedding(
is_neox_style: bool, is_neox_style: bool,
num_tokens: int, batch_size: int,
seq_len: int,
num_heads: int, num_heads: int,
head_size: int, head_size: int,
rotary_dim: Optional[int], rotary_dim: Optional[int],
@@ -122,53 +41,25 @@ def test_rotary_embedding(
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) torch.cuda.manual_seed(seed)
positions = torch.randint(0, max_position, (num_tokens, ), device="cuda") if rotary_dim is None:
query = torch.randn(num_tokens, rotary_dim = head_size
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
rope = rope.to(dtype).cuda()
positions = torch.randint(0,
max_position, (batch_size, seq_len),
device="cuda")
query = torch.randn(batch_size,
seq_len,
num_heads * head_size, num_heads * head_size,
dtype=dtype, dtype=dtype,
device="cuda") device="cuda")
key = torch.randn(num_tokens, key = torch.randn_like(query)
num_heads * head_size,
dtype=dtype,
device="cuda")
# Create the rotary embedding.
inv_freq = 1.0 / (base**(
torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim))
t = torch.arange(max_position).float()
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos()
sin = freqs.sin()
cos_sin_cache = torch.cat((cos, sin), dim=-1)
cos_sin_cache = cos_sin_cache.to(dtype=dtype, device='cuda')
# Run the kernel. The kernel is in-place, so we need to clone the inputs.
out_query = query.clone()
out_key = key.clone()
pos_encoding_ops.rotary_embedding(
positions,
out_query,
out_key,
head_size,
cos_sin_cache,
is_neox_style,
)
# Run the reference implementation.
ref_rotary_embedding = RefRotaryEmbedding(
dim=rotary_dim,
is_neox_style=is_neox_style,
max_position_embeddings=max_position,
base=base,
).to(dtype=dtype, device="cuda")
ref_query, ref_key = ref_rotary_embedding(
positions,
query.view(num_tokens, num_heads, head_size),
key.view(num_tokens, num_heads, head_size),
)
ref_query = ref_query.view(num_tokens, num_heads * head_size)
ref_key = ref_key.view(num_tokens, num_heads * head_size)
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_query, ref_key = rope._forward(positions, query, key)
out_query, out_key = rope.forward(positions, query, key)
# Compare the results. # Compare the results.
assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5) assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5)
assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5) assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5)
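For reference, the two rotation conventions that the is_neox_style flag selects between, as defined by the helpers this hunk removes (the consolidated implementation now lives behind get_rope):

import torch

def rotate_neox(x: torch.Tensor) -> torch.Tensor:
    # NeoX style: rotate the two contiguous halves of the head dimension.
    x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
    # GPT-J style: rotate interleaved even/odd channels instead.
    x1, x2 = x[..., ::2], x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)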

View File

@@ -6,14 +6,16 @@ import pytest
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
"mistralai/Mistral-7B-v0.1",
"tiiuae/falcon-7b",
"gpt2", "gpt2",
"bigcode/tiny_starcoder_py", "bigcode/tiny_starcoder_py",
"EleutherAI/gpt-j-6b", "EleutherAI/gpt-j-6b",
"EleutherAI/pythia-70m", "EleutherAI/pythia-70m",
"bigscience/bloom-560m", "bigscience/bloom-560m",
"mosaicml/mpt-7b", "mosaicml/mpt-7b",
"tiiuae/falcon-7b", "microsoft/phi-1_5",
"meta-llama/Llama-2-7b-hf",
] ]

View File

@@ -0,0 +1,55 @@
import pytest
import torch
from vllm import SamplingParams
MODELS = ["facebook/opt-125m"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_get_prompt_logprobs(
hf_runner,
vllm_runner,
model,
dtype,
example_prompts,
):
max_tokens = 5
hf_model = hf_runner(model, dtype=dtype)
hf_logprobs = hf_model.generate_greedy_logprobs(
example_prompts,
max_tokens=max_tokens,
)
del hf_model
vllm_model = vllm_runner(model, dtype=dtype)
vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
logprobs=5,
prompt_logprobs=5,
temperature=0.0)
vllm_results = vllm_model.model.generate(
example_prompts, sampling_params=vllm_sampling_params)
# Test whether logprobs are included in the results.
for result in vllm_results:
assert result.prompt_logprobs is not None
assert result.outputs[0].logprobs is not None
# Test whether prompt logprobs are consistent with HF
for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
# Check prompt logprobs
vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
for token_id, logprob in vllm_prompt_logprob_dict.items():
torch.testing.assert_close(logprob,
hf_logprob[0][i][token_id].item(),
atol=1e-2,
rtol=1e-2)
vllm_sample_logprobs = vllm_result.outputs[0].logprobs
for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs):
for token_id, logprob in vllm_sample_logprob_dict.items():
torch.testing.assert_close(logprob,
hf_logprob[i][-1][token_id].item(),
atol=1e-2,
rtol=1e-2)
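The new test exercises the user-facing prompt_logprobs / logprobs options. A minimal usage sketch of the same API (the prompt is illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(max_tokens=5, logprobs=5, prompt_logprobs=5, temperature=0.0)
outputs = llm.generate(["The capital of France is"], sampling_params=params)

# Each RequestOutput now carries per-token logprob dictionaries for both the
# prompt and the generated tokens.
print(outputs[0].prompt_logprobs)
print(outputs[0].outputs[0].logprobs)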

View File

@@ -1,14 +1,14 @@
import pytest
import random import random
from typing import Tuple from typing import Tuple
from unittest.mock import patch from unittest.mock import patch
import pytest
import torch import torch
from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.worker.worker import Worker from vllm.worker.model_runner import ModelRunner
class MockLogitsSampler(Sampler): class MockLogitsSampler(Sampler):
@@ -19,15 +19,15 @@ class MockLogitsSampler(Sampler):
def forward(self, *args, **kwargs): def forward(self, *args, **kwargs):
with patch("vllm.model_executor.layers.sampler._prune_hidden_states", with patch("vllm.model_executor.layers.sampler._prune_hidden_states",
lambda x, y: x): lambda x, y: x), patch(
with patch("vllm.model_executor.layers.sampler._get_logits", "vllm.model_executor.layers.sampler._get_logits",
lambda *args, **kwargs: self.fake_logits): lambda *args, **kwargs: self.fake_logits):
return super().forward(*args, **kwargs) return super().forward(*args, **kwargs)
def _prepare_test( def _prepare_test(
batch_size: int batch_size: int
) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, Worker]: ) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
vocab_size = 32000 vocab_size = 32000
input_tensor = torch.rand((batch_size, 1024), input_tensor = torch.rand((batch_size, 1024),
device="cuda", device="cuda",
@@ -37,9 +37,8 @@ def _prepare_test(
device=input_tensor.device, device=input_tensor.device,
dtype=input_tensor.dtype) dtype=input_tensor.dtype)
sampler = MockLogitsSampler(32000, fake_logits) sampler = MockLogitsSampler(32000, fake_logits)
worker = Worker(None, None, None) model_runner = ModelRunner(None, None, None)
worker.block_size = 16 return input_tensor, fake_logits, sampler, model_runner
return input_tensor, fake_logits, sampler, worker
RANDOM_SEEDS = list(range(128)) RANDOM_SEEDS = list(range(128))
@@ -49,9 +48,11 @@ RANDOM_SEEDS = list(range(128))
def test_sampler_all_greedy(seed: int): def test_sampler_all_greedy(seed: int):
set_random_seed(seed) set_random_seed(seed)
batch_size = random.randint(1, 256) batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size) input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
seq_group_metadata_list = [] seq_group_metadata_list = []
prompt_lens = []
for i in range(batch_size): for i in range(batch_size):
seq_group_metadata_list.append( seq_group_metadata_list.append(
SequenceGroupMetadata( SequenceGroupMetadata(
@@ -61,14 +62,16 @@ def test_sampler_all_greedy(seed: int):
sampling_params=SamplingParams(temperature=0, ), sampling_params=SamplingParams(temperature=0, ),
block_tables={0: [1]}, block_tables={0: [1]},
)) ))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
_, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list) sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens)
sampler_output = sampler(embedding=None, sampler_output = sampler(embedding=None,
hidden_states=input_tensor, hidden_states=input_tensor,
input_metadata=input_metadata) sampling_metadata=sampling_metadata)
expected = torch.argmax(fake_logits, dim=-1) expected = torch.argmax(fake_logits, dim=-1)
for i, sequence_output in enumerate(sampler_output): for i, sequence_output in enumerate(sampler_output):
for nth_output in sequence_output: for nth_output in sequence_output.samples:
assert nth_output.output_token == expected[i].item() assert nth_output.output_token == expected[i].item()
@@ -76,12 +79,14 @@ def test_sampler_all_greedy(seed: int):
def test_sampler_all_random(seed: int): def test_sampler_all_random(seed: int):
set_random_seed(seed) set_random_seed(seed)
batch_size = random.randint(1, 256) batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size) input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
for i in range(batch_size): for i in range(batch_size):
fake_logits[i, i] = 1e2 fake_logits[i, i] = 1e2
seq_group_metadata_list = [] seq_group_metadata_list = []
prompt_lens = []
for i in range(batch_size): for i in range(batch_size):
seq_group_metadata_list.append( seq_group_metadata_list.append(
SequenceGroupMetadata( SequenceGroupMetadata(
@@ -94,13 +99,15 @@ def test_sampler_all_random(seed: int):
), ),
block_tables={0: [1]}, block_tables={0: [1]},
)) ))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
_, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list) sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens)
sampler_output = sampler(embedding=None, sampler_output = sampler(embedding=None,
hidden_states=input_tensor, hidden_states=input_tensor,
input_metadata=input_metadata) sampling_metadata=sampling_metadata)
for i, sequence_output in enumerate(sampler_output): for i, sequence_output in enumerate(sampler_output):
for nth_output in sequence_output: for nth_output in sequence_output.samples:
assert nth_output.output_token == i assert nth_output.output_token == i
@@ -108,9 +115,10 @@ def test_sampler_all_random(seed: int):
def test_sampler_all_beam(seed: int): def test_sampler_all_beam(seed: int):
set_random_seed(seed) set_random_seed(seed)
batch_size = random.randint(1, 256) batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size) input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
seq_group_metadata_list = [] seq_group_metadata_list = []
prompt_lens = []
for i in range(batch_size): for i in range(batch_size):
seq_group_metadata_list.append( seq_group_metadata_list.append(
SequenceGroupMetadata( SequenceGroupMetadata(
@@ -124,11 +132,13 @@ def test_sampler_all_beam(seed: int):
), ),
block_tables={0: [1]}, block_tables={0: [1]},
)) ))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
_, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list) sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens)
sampler(embedding=None, sampler(embedding=None,
hidden_states=input_tensor, hidden_states=input_tensor,
input_metadata=input_metadata) sampling_metadata=sampling_metadata)
# no assertion here as I am not sure how to determine whether # no assertion here as I am not sure how to determine whether
# the outputs are expected - in other words, this just tests # the outputs are expected - in other words, this just tests
# whether there are no exceptions in the sampler # whether there are no exceptions in the sampler
@@ -139,10 +149,12 @@ def test_sampler_all_beam(seed: int):
def test_sampler_mixed(seed: int): def test_sampler_mixed(seed: int):
set_random_seed(seed) set_random_seed(seed)
batch_size = random.randint(1, 256) batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size) input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
seq_group_metadata_list = [] seq_group_metadata_list = []
expected_tokens = [] expected_tokens = []
prompt_lens = []
for i in range(batch_size): for i in range(batch_size):
n = 1 n = 1
sampling_type = random.randint(0, 2) sampling_type = random.randint(0, 2)
@@ -172,13 +184,52 @@ def test_sampler_mixed(seed: int):
sampling_params=sampling_params, sampling_params=sampling_params,
block_tables={0: [1]}, block_tables={0: [1]},
)) ))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
_, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list) sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens)
sampler_output = sampler(embedding=None, sampler_output = sampler(embedding=None,
hidden_states=input_tensor, hidden_states=input_tensor,
input_metadata=input_metadata) sampling_metadata=sampling_metadata)
for i, sequence_output in enumerate(sampler_output): for i, sequence_output in enumerate(sampler_output):
if seq_group_metadata_list[i].sampling_params.use_beam_search: if seq_group_metadata_list[i].sampling_params.use_beam_search:
continue continue
for nth_output in sequence_output: for nth_output in sequence_output.samples:
assert nth_output.output_token in expected_tokens assert nth_output.output_token in expected_tokens
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_logits_processors(seed: int):
set_random_seed(seed)
batch_size = random.randint(1, 256)
input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
# This sample logits processor gives infinite score to the i-th token,
# where i is the length of the input sequence.
# We therefore expect the output token sequence to be [0, 1, 2, ...]
def pick_ith(token_ids, logits):
logits[len(token_ids)] = float("inf")
return logits
seq_group_metadata_list = []
prompt_lens = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=True,
seq_data={0: SequenceData([1, 2, 3])},
sampling_params=SamplingParams(temperature=0,
logits_processors=[pick_ith]),
block_tables={0: [1]},
))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens)
sampler_output = sampler(embedding=None,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
for _, sequence_output in enumerate(sampler_output):
for idx, nth_output in enumerate(sequence_output.samples):
assert nth_output.output_token == idx
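For context on how the logits_processors hook exercised by this test is used from the public API, here is a minimal end-to-end sketch; the processor name (ban_token_42) and the prompt are illustrative and not part of the diff:

from vllm import LLM, SamplingParams

# A logits processor receives the token ids generated so far and the logits
# for the next token, and returns (possibly modified) logits.
def ban_token_42(token_ids, logits):
    logits[42] = -float("inf")  # make token id 42 unselectable
    return logits

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0, logits_processors=[ban_token_42])
outputs = llm.generate(["Hello, my name is"], sampling_params=params)
print(outputs[0].outputs[0].text)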

tests/test_regression.py (new file, 27 lines)
View File

@@ -0,0 +1,27 @@
"""Containing tests that check for regressions in vLLM's behavior.
It should include tests that are reported by users and making sure they
will never happen again.
"""
from vllm import LLM, SamplingParams
def test_duplicated_ignored_sequence_group():
"""https://github.com/vllm-project/vllm/issues/1655"""
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
llm = LLM(model="facebook/opt-125m",
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
outputs = llm.generate(prompts, sampling_params=sampling_params)
assert len(prompts) == len(outputs)
if __name__ == "__main__":
import pytest
pytest.main([__file__])

View File

@@ -0,0 +1,48 @@
import random
import torch
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.worker.model_runner import ModelRunner
def test_prepare_prompt():
model_runner = ModelRunner(None, None, None)
model_runner.set_block_size(16)
batch_size = random.randint(1, 256)
prompt_lens = []
seq_group_metadata_list = []
for i in range(batch_size):
# make sure all tokens fit into one block
prompt_len = i % (model_runner.block_size - 1) + 1
prompt_lens.append(prompt_len)
seq_data = list(range(prompt_len))
seq_group_metadata_list.append(
SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=True,
seq_data={0: SequenceData(seq_data)},
sampling_params=SamplingParams(temperature=0),
block_tables={0: [1]},
))
expected_selected_token_indices = []
selected_token_start_idx = 0
max_seq_len = max(prompt_lens)
for prompt_len in prompt_lens:
expected_selected_token_indices.append(selected_token_start_idx +
prompt_len - 1)
selected_token_start_idx += max_seq_len
input_tokens, input_positions, _ = model_runner._prepare_prompt(
seq_group_metadata_list)
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens)
assert input_tokens.shape == (batch_size, max_seq_len)
assert input_positions.shape == (batch_size, max_seq_len)
torch.testing.assert_close(input_tokens, input_positions)
actual = sampling_metadata.selected_token_indices
expected = torch.tensor(expected_selected_token_indices,
device=actual.device,
dtype=actual.dtype)
torch.testing.assert_close(actual, expected)
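For intuition, a small worked example of the selected_token_indices math the test checks above; the prompt lengths are made up:

prompt_lens = [3, 5, 2]            # illustrative values
max_seq_len = max(prompt_lens)     # 5
expected = []
start = 0
for prompt_len in prompt_lens:
    # each padded row contributes max_seq_len slots; we want the last real token
    expected.append(start + prompt_len - 1)
    start += max_seq_len
print(expected)  # [2, 9, 11]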

View File

@@ -8,7 +8,7 @@ from vllm.entrypoints.llm import LLM
from vllm.outputs import CompletionOutput, RequestOutput from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
__version__ = "0.2.0" __version__ = "0.2.3"
__all__ = [ __all__ = [
"LLM", "LLM",

View File

@@ -1,4 +1,5 @@
from typing import Optional from typing import Optional, Union
import os
import torch import torch
from transformers import PretrainedConfig from transformers import PretrainedConfig
@@ -41,6 +42,9 @@ class ModelConfig:
revision: The specific model version to use. It can be a branch name, revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id. If unspecified, will use the default a tag name, or a commit id. If unspecified, will use the default
version. version.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id. If unspecified, will use
the default version.
max_model_len: Maximum length of a sequence (including prompt and max_model_len: Maximum length of a sequence (including prompt and
output). If None, will be derived from the model. output). If None, will be derived from the model.
quantization: Quantization method that was used to quantize the model quantization: Quantization method that was used to quantize the model
@@ -55,9 +59,10 @@ class ModelConfig:
trust_remote_code: bool, trust_remote_code: bool,
download_dir: Optional[str], download_dir: Optional[str],
load_format: str, load_format: str,
dtype: str, dtype: Union[str, torch.dtype],
seed: int, seed: int,
revision: Optional[str] = None, revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
max_model_len: Optional[int] = None, max_model_len: Optional[int] = None,
quantization: Optional[str] = None, quantization: Optional[str] = None,
) -> None: ) -> None:
@@ -69,9 +74,21 @@ class ModelConfig:
self.load_format = load_format self.load_format = load_format
self.seed = seed self.seed = seed
self.revision = revision self.revision = revision
self.tokenizer_revision = tokenizer_revision
self.quantization = quantization self.quantization = quantization
self.hf_config = get_config(model, trust_remote_code, revision) if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C
model_path = snapshot_download(model_id=model,
cache_dir=download_dir,
revision=revision)
self.model = model_path
self.download_dir = model_path
self.tokenizer = model_path
self.hf_config = get_config(self.model, trust_remote_code, revision)
self.dtype = _get_and_verify_dtype(self.hf_config, dtype) self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
self.max_model_len = _get_and_verify_max_len(self.hf_config, self.max_model_len = _get_and_verify_max_len(self.hf_config,
max_model_len) max_model_len)
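A hedged sketch of how the new VLLM_USE_MODELSCOPE path above might be exercised; it assumes the modelscope package is installed, and the model id is an illustrative placeholder rather than something taken from this diff:

import os

# Must be set before the engine resolves the model path.
os.environ["VLLM_USE_MODELSCOPE"] = "true"

from vllm import LLM
llm = LLM(model="qwen/Qwen-7B-Chat", trust_remote_code=True)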
@@ -98,15 +115,31 @@ class ModelConfig:
self.tokenizer_mode = tokenizer_mode self.tokenizer_mode = tokenizer_mode
def _verify_quantization(self) -> None: def _verify_quantization(self) -> None:
supported_quantization = ["awq"] supported_quantization = ["awq", "squeezellm"]
if self.quantization is None: if self.quantization is not None:
return self.quantization = self.quantization.lower()
quantization = self.quantization.lower()
if quantization not in supported_quantization: # Parse quantization method from the HF model config, if available.
raise ValueError( hf_quant_config = getattr(self.hf_config, "quantization_config", None)
f"Unknown quantization: {self.quantization}. Must be one of " if hf_quant_config is not None:
f"{supported_quantization}.") hf_quant_method = str(hf_quant_config["quant_method"]).lower()
self.quantization = quantization if self.quantization is None:
self.quantization = hf_quant_method
elif self.quantization != hf_quant_method:
raise ValueError(
"Quantization method specified in the model config "
f"({hf_quant_method}) does not match the quantization "
f"method specified in the `quantization` argument "
f"({self.quantization}).")
if self.quantization is not None:
if self.quantization not in supported_quantization:
raise ValueError(
f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}.")
logger.warning(f"{self.quantization} quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.")
def verify_with_parallel_config( def verify_with_parallel_config(
self, self,
@@ -128,6 +161,12 @@ class ModelConfig:
"must be divisible by pipeline parallel size " "must be divisible by pipeline parallel size "
f"({pipeline_parallel_size}).") f"({pipeline_parallel_size}).")
def get_sliding_window(self) -> Optional[int]:
return getattr(self.hf_config, "sliding_window", None)
def get_vocab_size(self) -> int:
return self.hf_config.vocab_size
def get_hidden_size(self) -> int: def get_hidden_size(self) -> int:
return self.hf_config.hidden_size return self.hf_config.hidden_size
@@ -135,10 +174,10 @@ class ModelConfig:
# FIXME(woosuk): This may not be true for all models. # FIXME(woosuk): This may not be true for all models.
return self.hf_config.hidden_size // self.hf_config.num_attention_heads return self.hf_config.hidden_size // self.hf_config.num_attention_heads
def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: def get_total_num_kv_heads(self) -> int:
"""Returns the number of KV heads per GPU worker.""" """Returns the total number of KV heads."""
# For GPTBigCode & Falcon: # For GPTBigCode & Falcon:
# Note: for falcon, when new_decoder_architecture is True, the # NOTE: for falcon, when new_decoder_architecture is True, the
# multi_query flag is ignored and we use n_head_kv for the number of # multi_query flag is ignored and we use n_head_kv for the number of
# KV heads. # KV heads.
falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
@@ -150,19 +189,34 @@ class ModelConfig:
# Multi-query attention, only one KV head. # Multi-query attention, only one KV head.
# Currently, tensor parallelism is not supported in this case. # Currently, tensor parallelism is not supported in this case.
return 1 return 1
# For Falcon:
if getattr(self.hf_config, "n_head_kv", None) is not None: attributes = [
return (self.hf_config.n_head_kv // # For Falcon:
parallel_config.tensor_parallel_size) "n_head_kv",
if getattr(self.hf_config, "num_kv_heads", None) is not None: "num_kv_heads",
return (self.hf_config.num_kv_heads // # For LLaMA-2:
parallel_config.tensor_parallel_size) "num_key_value_heads",
# For LLaMA-2: # For ChatGLM:
if getattr(self.hf_config, "num_key_value_heads", None) is not None: "multi_query_group_num",
return (self.hf_config.num_key_value_heads // ]
parallel_config.tensor_parallel_size) for attr in attributes:
total_num_attention_heads = self.hf_config.num_attention_heads num_kv_heads = getattr(self.hf_config, attr, None)
return total_num_attention_heads // parallel_config.tensor_parallel_size if num_kv_heads is not None:
return num_kv_heads
# For non-grouped-query attention models, the number of KV heads is
# equal to the number of attention heads.
return self.hf_config.num_attention_heads
def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
"""Returns the number of KV heads per GPU."""
total_num_kv_heads = self.get_total_num_kv_heads()
# If tensor parallelism is used, we divide the number of KV heads by
# the tensor parallel size. We will replicate the KV heads in the
# case where the number of KV heads is smaller than the tensor
# parallel size so each GPU has at least one KV head.
return max(1,
total_num_kv_heads // parallel_config.tensor_parallel_size)
def get_num_layers(self, parallel_config: "ParallelConfig") -> int: def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
total_num_hidden_layers = self.hf_config.num_hidden_layers total_num_hidden_layers = self.hf_config.num_hidden_layers
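The replication rule in get_num_kv_heads above is easiest to see with numbers (all values illustrative):

total_num_kv_heads = 8
for tensor_parallel_size in (2, 8, 16):
    num_kv_heads_per_gpu = max(1, total_num_kv_heads // tensor_parallel_size)
    print(tensor_parallel_size, num_kv_heads_per_gpu)
# tp=2  -> 4 KV heads per GPU
# tp=8  -> 1 KV head per GPU
# tp=16 -> 1 KV head per GPU: heads are replicated instead of being split below one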
@@ -237,10 +291,12 @@ class ParallelConfig:
pipeline_parallel_size: int, pipeline_parallel_size: int,
tensor_parallel_size: int, tensor_parallel_size: int,
worker_use_ray: bool, worker_use_ray: bool,
max_parallel_loading_workers: Optional[int] = None,
) -> None: ) -> None:
self.pipeline_parallel_size = pipeline_parallel_size self.pipeline_parallel_size = pipeline_parallel_size
self.tensor_parallel_size = tensor_parallel_size self.tensor_parallel_size = tensor_parallel_size
self.worker_use_ray = worker_use_ray self.worker_use_ray = worker_use_ray
self.max_parallel_loading_workers = max_parallel_loading_workers
self.world_size = pipeline_parallel_size * tensor_parallel_size self.world_size = pipeline_parallel_size * tensor_parallel_size
if self.world_size > 1: if self.world_size > 1:
@@ -263,6 +319,7 @@ class SchedulerConfig:
iteration. iteration.
max_model_len: Maximum length of a sequence (including prompt max_model_len: Maximum length of a sequence (including prompt
and generated text). and generated text).
max_paddings: Maximum number of paddings to be added to a batch.
""" """
def __init__( def __init__(
@@ -270,6 +327,7 @@ class SchedulerConfig:
max_num_batched_tokens: Optional[int], max_num_batched_tokens: Optional[int],
max_num_seqs: int, max_num_seqs: int,
max_model_len: int, max_model_len: int,
max_paddings: int,
) -> None: ) -> None:
if max_num_batched_tokens is not None: if max_num_batched_tokens is not None:
self.max_num_batched_tokens = max_num_batched_tokens self.max_num_batched_tokens = max_num_batched_tokens
@@ -279,6 +337,7 @@ class SchedulerConfig:
self.max_num_batched_tokens = max(max_model_len, 2048) self.max_num_batched_tokens = max(max_model_len, 2048)
self.max_num_seqs = max_num_seqs self.max_num_seqs = max_num_seqs
self.max_model_len = max_model_len self.max_model_len = max_model_len
self.max_paddings = max_paddings
self._verify_args() self._verify_args()
def _verify_args(self) -> None: def _verify_args(self) -> None:
@@ -308,7 +367,7 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
def _get_and_verify_dtype( def _get_and_verify_dtype(
config: PretrainedConfig, config: PretrainedConfig,
dtype: str, dtype: Union[str, torch.dtype],
) -> torch.dtype: ) -> torch.dtype:
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
# because config.torch_dtype can be None. # because config.torch_dtype can be None.
@@ -316,17 +375,23 @@ def _get_and_verify_dtype(
if config_dtype is None: if config_dtype is None:
config_dtype = torch.float32 config_dtype = torch.float32
dtype = dtype.lower() if isinstance(dtype, str):
if dtype == "auto": dtype = dtype.lower()
if config_dtype == torch.float32: if dtype == "auto":
# Following the common practice, we use float16 for float32 models. if config_dtype == torch.float32:
torch_dtype = torch.float16 # Following the common practice, we use float16 for float32
# models.
torch_dtype = torch.float16
else:
torch_dtype = config_dtype
else: else:
torch_dtype = config_dtype if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
raise ValueError(f"Unknown dtype: {dtype}")
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
elif isinstance(dtype, torch.dtype):
torch_dtype = dtype
else: else:
if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: raise ValueError(f"Unknown dtype: {dtype}")
raise ValueError(f"Unknown dtype: {dtype}")
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
# Verify the dtype. # Verify the dtype.
if torch_dtype != config_dtype: if torch_dtype != config_dtype:
@@ -340,15 +405,6 @@ def _get_and_verify_dtype(
# Casting between float16 and bfloat16 is allowed with a warning. # Casting between float16 and bfloat16 is allowed with a warning.
logger.warning(f"Casting {config_dtype} to {torch_dtype}.") logger.warning(f"Casting {config_dtype} to {torch_dtype}.")
# Check if the GPU supports the dtype.
if torch_dtype == torch.bfloat16:
compute_capability = torch.cuda.get_device_capability()
if compute_capability[0] < 8:
gpu_name = torch.cuda.get_device_name()
raise ValueError(
"Bfloat16 is only supported on GPUs with compute capability "
f"of at least 8.0. Your {gpu_name} GPU has compute capability "
f"{compute_capability[0]}.{compute_capability[1]}.")
return torch_dtype return torch_dtype
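A rough behavioral sketch of the dtype resolution above, written as a standalone function so the branches can be tried in isolation; the string-to-dtype map mirrors the one named in the hunk but is reproduced here only for illustration:

import torch

_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16, "float16": torch.float16,
    "bfloat16": torch.bfloat16, "float": torch.float32, "float32": torch.float32,
}

def resolve_dtype(dtype, config_dtype=torch.float32):
    if isinstance(dtype, str):
        dtype = dtype.lower()
        if dtype == "auto":
            # float32 checkpoints default to float16; otherwise keep the config dtype
            return torch.float16 if config_dtype == torch.float32 else config_dtype
        if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
            raise ValueError(f"Unknown dtype: {dtype}")
        return _STR_DTYPE_TO_TORCH_DTYPE[dtype]
    if isinstance(dtype, torch.dtype):
        return dtype
    raise ValueError(f"Unknown dtype: {dtype}")

print(resolve_dtype("auto"))          # torch.float16
print(resolve_dtype(torch.bfloat16))  # torch.bfloat16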
@@ -365,6 +421,8 @@ def _get_and_verify_max_len(
"n_positions", "n_positions",
# MPT # MPT
"max_seq_len", "max_seq_len",
# ChatGLM2
"seq_length",
# Others # Others
"max_sequence_length", "max_sequence_length",
"max_seq_length", "max_seq_length",
@@ -391,6 +449,9 @@ def _get_and_verify_max_len(
if rope_scaling is not None: if rope_scaling is not None:
assert "factor" in rope_scaling assert "factor" in rope_scaling
scaling_factor = rope_scaling["factor"] scaling_factor = rope_scaling["factor"]
if rope_scaling["type"] == "yarn":
derived_max_model_len = rope_scaling[
"original_max_position_embeddings"]
derived_max_model_len *= scaling_factor derived_max_model_len *= scaling_factor
if max_model_len is None: if max_model_len is None:
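A worked example of the new YaRN branch above (the rope_scaling values are illustrative):

rope_scaling = {"type": "yarn", "factor": 4.0,
                "original_max_position_embeddings": 4096}
derived_max_model_len = rope_scaling["original_max_position_embeddings"]  # 4096
derived_max_model_len *= rope_scaling["factor"]
print(derived_max_model_len)  # 16384.0: YaRN scales the original context length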

View File

@@ -1,10 +1,14 @@
"""A block manager that manages token blocks.""" """A block manager that manages token blocks."""
import enum
from typing import Dict, List, Optional, Set, Tuple from typing import Dict, List, Optional, Set, Tuple
from vllm.block import PhysicalTokenBlock from vllm.block import PhysicalTokenBlock
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device from vllm.utils import Device
# Mapping: logical block number -> physical block.
BlockTable = List[PhysicalTokenBlock]
class BlockAllocator: class BlockAllocator:
"""Manages free physical token blocks for a device. """Manages free physical token blocks for a device.
@@ -25,7 +29,7 @@ class BlockAllocator:
self.num_blocks = num_blocks self.num_blocks = num_blocks
# Initialize the free blocks. # Initialize the free blocks.
self.free_blocks: List[PhysicalTokenBlock] = [] self.free_blocks: BlockTable = []
for i in range(num_blocks): for i in range(num_blocks):
block = PhysicalTokenBlock(device=device, block = PhysicalTokenBlock(device=device,
block_number=i, block_number=i,
@@ -50,8 +54,18 @@ class BlockAllocator:
return len(self.free_blocks) return len(self.free_blocks)
# Mapping: logical block number -> physical block. class AllocStatus(enum.Enum):
BlockTable = List[PhysicalTokenBlock] """Result for BlockSpaceManager.can_allocate
1. Ok: seq_group can be allocated now.
2. Later: seq_group cannot be allocated now, but the allocator has enough
total capacity, so it may be allocated later.
3. Never: seq_group can never be allocated.
The seq_group is too large to be allocated in GPU memory.
"""
OK = enum.auto()
LATER = enum.auto()
NEVER = enum.auto()
class BlockSpaceManager: class BlockSpaceManager:
@@ -86,7 +100,7 @@ class BlockSpaceManager:
# Mapping: seq_id -> BlockTable. # Mapping: seq_id -> BlockTable.
self.block_tables: Dict[int, BlockTable] = {} self.block_tables: Dict[int, BlockTable] = {}
def can_allocate(self, seq_group: SequenceGroup) -> bool: def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
# FIXME(woosuk): Here we assume that all sequences in the group share # FIXME(woosuk): Here we assume that all sequences in the group share
# the same prompt. This may not be true for preempted sequences. # the same prompt. This may not be true for preempted sequences.
seq = seq_group.get_seqs()[0] seq = seq_group.get_seqs()[0]
@@ -95,9 +109,15 @@ class BlockSpaceManager:
num_required_blocks = min(num_required_blocks, num_required_blocks = min(num_required_blocks,
self.block_sliding_window) self.block_sliding_window)
num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks() num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
# Use watermark to avoid frequent cache eviction. # Use watermark to avoid frequent cache eviction.
return (num_free_gpu_blocks - num_required_blocks >= if (self.num_total_gpu_blocks - num_required_blocks <
self.watermark_blocks) self.watermark_blocks):
return AllocStatus.NEVER
if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
return AllocStatus.OK
else:
return AllocStatus.LATER
def allocate(self, seq_group: SequenceGroup) -> None: def allocate(self, seq_group: SequenceGroup) -> None:
# NOTE: Here we assume that all sequences in the group have the same # NOTE: Here we assume that all sequences in the group have the same

View File

@@ -3,7 +3,7 @@ import time
from typing import Dict, Iterable, List, Optional, Tuple, Union from typing import Dict, Iterable, List, Optional, Tuple, Union
from vllm.config import CacheConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.block_manager import BlockSpaceManager from vllm.core.block_manager import AllocStatus, BlockSpaceManager
from vllm.core.policy import PolicyFactory from vllm.core.policy import PolicyFactory
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sequence import (Sequence, SequenceData, SequenceGroup, from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
@@ -121,7 +121,7 @@ class Scheduler:
blocks_to_copy: Dict[int, List[int]] = {} blocks_to_copy: Dict[int, List[int]] = {}
# Fix the current time. # Fix the current time.
now = time.time() now = time.monotonic()
# Join waiting sequences if possible. # Join waiting sequences if possible.
if not self.swapped: if not self.swapped:
@@ -131,7 +131,8 @@ class Scheduler:
# requests in the generation phase. # requests in the generation phase.
num_curr_seqs = sum(seq_group.get_max_num_running_seqs() num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
for seq_group in self.running) for seq_group in self.running)
num_batched_tokens = 0 seq_lens: List[int] = []
# Optimization: We do not sort the waiting queue since the preempted # Optimization: We do not sort the waiting queue since the preempted
# sequence groups are added to the front and the new sequence groups # sequence groups are added to the front and the new sequence groups
# are added to the back. # are added to the back.
@@ -153,11 +154,23 @@ class Scheduler:
continue continue
# If the sequence group cannot be allocated, stop. # If the sequence group cannot be allocated, stop.
if not self.block_manager.can_allocate(seq_group): can_allocate = self.block_manager.can_allocate(seq_group)
if can_allocate == AllocStatus.LATER:
break break
elif can_allocate == AllocStatus.NEVER:
logger.warning(
f"Input prompt ({num_prompt_tokens} tokens) is too long"
f" and exceeds the capacity of block_manager")
for seq in seq_group.get_seqs():
seq.status = SequenceStatus.FINISHED_IGNORED
ignored_seq_groups.append(seq_group)
self.waiting.pop(0)
continue
# If the number of batched tokens exceeds the limit, stop. # If the number of batched tokens exceeds the limit, stop.
if (num_batched_tokens + num_prompt_tokens > new_seq_lens = seq_lens + [num_prompt_tokens]
num_batched_tokens = len(new_seq_lens) * max(new_seq_lens)
if (num_batched_tokens >
self.scheduler_config.max_num_batched_tokens): self.scheduler_config.max_num_batched_tokens):
break break
@@ -168,10 +181,14 @@ class Scheduler:
self.scheduler_config.max_num_seqs): self.scheduler_config.max_num_seqs):
break break
num_paddings = num_batched_tokens - sum(new_seq_lens)
if num_paddings > self.scheduler_config.max_paddings:
break
seq_lens = new_seq_lens
seq_group = self.waiting.pop(0) seq_group = self.waiting.pop(0)
self._allocate(seq_group) self._allocate(seq_group)
self.running.append(seq_group) self.running.append(seq_group)
num_batched_tokens += num_prompt_tokens
num_curr_seqs += num_new_seqs num_curr_seqs += num_new_seqs
scheduled.append(seq_group) scheduled.append(seq_group)
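The padding budget introduced above is easiest to follow with concrete numbers (all illustrative):

seq_lens = [7, 12]          # prompts already admitted in this scheduling step
num_prompt_tokens = 30      # candidate prompt being considered
max_paddings = 32

new_seq_lens = seq_lens + [num_prompt_tokens]
num_batched_tokens = len(new_seq_lens) * max(new_seq_lens)  # 3 * 30 = 90
num_paddings = num_batched_tokens - sum(new_seq_lens)       # 90 - 49 = 41
print(num_paddings > max_paddings)  # True: the 30-token prompt waits for a later step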
@@ -179,7 +196,8 @@ class Scheduler:
scheduler_outputs = SchedulerOutputs( scheduler_outputs = SchedulerOutputs(
scheduled_seq_groups=scheduled, scheduled_seq_groups=scheduled,
prompt_run=True, prompt_run=True,
num_batched_tokens=num_batched_tokens, num_batched_tokens=len(seq_lens) *
max(seq_lens) if seq_lens else 0,
blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_in=blocks_to_swap_in,
blocks_to_swap_out=blocks_to_swap_out, blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy, blocks_to_copy=blocks_to_copy,
@@ -268,7 +286,7 @@ class Scheduler:
# Create input data structures. # Create input data structures.
seq_group_metadata_list: List[SequenceGroupMetadata] = [] seq_group_metadata_list: List[SequenceGroupMetadata] = []
for seq_group in scheduler_outputs.scheduled_seq_groups: for seq_group in scheduler_outputs.scheduled_seq_groups:
seq_data: Dict[int, List[SequenceData]] = {} seq_data: Dict[int, SequenceData] = {}
block_tables: Dict[int, List[int]] = {} block_tables: Dict[int, List[int]] = {}
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
seq_id = seq.seq_id seq_id = seq.seq_id
@@ -343,7 +361,7 @@ class Scheduler:
elif preemption_mode == PreemptionMode.SWAP: elif preemption_mode == PreemptionMode.SWAP:
self._preempt_by_swap(seq_group, blocks_to_swap_out) self._preempt_by_swap(seq_group, blocks_to_swap_out)
else: else:
assert False, "Invalid preemption mode." raise AssertionError("Invalid preemption mode.")
def _preempt_by_recompute( def _preempt_by_recompute(
self, self,

View File

@@ -22,13 +22,16 @@ class EngineArgs:
worker_use_ray: bool = False worker_use_ray: bool = False
pipeline_parallel_size: int = 1 pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1 tensor_parallel_size: int = 1
max_parallel_loading_workers: Optional[int] = None
block_size: int = 16 block_size: int = 16
swap_space: int = 4 # GiB swap_space: int = 4 # GiB
gpu_memory_utilization: float = 0.90 gpu_memory_utilization: float = 0.90
max_num_batched_tokens: Optional[int] = None max_num_batched_tokens: Optional[int] = None
max_num_seqs: int = 256 max_num_seqs: int = 256
max_paddings: int = 256
disable_log_stats: bool = False disable_log_stats: bool = False
revision: Optional[str] = None revision: Optional[str] = None
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None quantization: Optional[str] = None
def __post_init__(self): def __post_init__(self):
@@ -39,6 +42,10 @@ class EngineArgs:
def add_cli_args( def add_cli_args(
parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
"""Shared CLI arguments for vLLM engine.""" """Shared CLI arguments for vLLM engine."""
# NOTE: If you update any of the arguments below, please also
# make sure to update docs/source/models/engine_args.rst
# Model arguments # Model arguments
parser.add_argument( parser.add_argument(
'--model', '--model',
@@ -57,6 +64,13 @@ class EngineArgs:
help='the specific model version to use. It can be a branch ' help='the specific model version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use ' 'name, a tag name, or a commit id. If unspecified, will use '
'the default version.') 'the default version.')
parser.add_argument(
'--tokenizer-revision',
type=str,
default=None,
help='the specific tokenizer version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument('--tokenizer-mode', parser.add_argument('--tokenizer-mode',
type=str, type=str,
default=EngineArgs.tokenizer_mode, default=EngineArgs.tokenizer_mode,
@@ -119,6 +133,12 @@ class EngineArgs:
type=int, type=int,
default=EngineArgs.tensor_parallel_size, default=EngineArgs.tensor_parallel_size,
help='number of tensor parallel replicas') help='number of tensor parallel replicas')
parser.add_argument(
'--max-parallel-loading-workers',
type=int,
help='load model sequentially in multiple batches, '
'to avoid RAM OOM when using tensor '
'parallelism with large models')
# KV cache arguments # KV cache arguments
parser.add_argument('--block-size', parser.add_argument('--block-size',
type=int, type=int,
@@ -148,6 +168,10 @@ class EngineArgs:
type=int, type=int,
default=EngineArgs.max_num_seqs, default=EngineArgs.max_num_seqs,
help='maximum number of sequences per iteration') help='maximum number of sequences per iteration')
parser.add_argument('--max-paddings',
type=int,
default=EngineArgs.max_paddings,
help='maximum number of paddings in a batch')
parser.add_argument('--disable-log-stats', parser.add_argument('--disable-log-stats',
action='store_true', action='store_true',
help='disable logging statistics') help='disable logging statistics')
@@ -155,7 +179,7 @@ class EngineArgs:
parser.add_argument('--quantization', parser.add_argument('--quantization',
'-q', '-q',
type=str, type=str,
choices=['awq', None], choices=['awq', 'squeezellm', None],
default=None, default=None,
help='Method used to quantize the weights') help='Method used to quantize the weights')
return parser return parser
@@ -175,16 +199,20 @@ class EngineArgs:
self.tokenizer_mode, self.trust_remote_code, self.tokenizer_mode, self.trust_remote_code,
self.download_dir, self.load_format, self.download_dir, self.load_format,
self.dtype, self.seed, self.revision, self.dtype, self.seed, self.revision,
self.max_model_len, self.quantization) self.tokenizer_revision, self.max_model_len,
cache_config = CacheConfig( self.quantization)
self.block_size, self.gpu_memory_utilization, self.swap_space, cache_config = CacheConfig(self.block_size,
getattr(model_config.hf_config, 'sliding_window', None)) self.gpu_memory_utilization,
self.swap_space,
model_config.get_sliding_window())
parallel_config = ParallelConfig(self.pipeline_parallel_size, parallel_config = ParallelConfig(self.pipeline_parallel_size,
self.tensor_parallel_size, self.tensor_parallel_size,
self.worker_use_ray) self.worker_use_ray,
self.max_parallel_loading_workers)
scheduler_config = SchedulerConfig(self.max_num_batched_tokens, scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
self.max_num_seqs, self.max_num_seqs,
model_config.max_model_len) model_config.max_model_len,
self.max_paddings)
return model_config, cache_config, parallel_config, scheduler_config return model_config, cache_config, parallel_config, scheduler_config

View File

@@ -142,10 +142,10 @@ class RequestTracker:
self._request_streams[request_id].finish() self._request_streams[request_id].finish()
def get_new_and_finished_requests(self) -> Tuple[List[dict], Set[str]]: def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]:
"""Get the new requests and finished requests to be """Get the new requests and finished requests to be
sent to the engine.""" sent to the engine."""
new_requests: List[dict] = [] new_requests: List[Dict] = []
finished_requests: Set[str] = set() finished_requests: Set[str] = set()
while not self._finished_requests.empty(): while not self._finished_requests.empty():
@@ -206,18 +206,17 @@ class _AsyncLLMEngine(LLMEngine):
**kwargs, **kwargs,
) -> Any: ) -> Any:
"""Runs the given method on all workers.""" """Runs the given method on all workers."""
all_outputs = [] coros = []
for worker in self.workers: for worker in self.workers:
if self.parallel_config.worker_use_ray: if self.parallel_config.worker_use_ray:
executor = partial(worker.execute_method.remote, method) coros.append(
worker.execute_method.remote(method, *args, **kwargs))
else: else:
executor = getattr(worker, method) executor = getattr(worker, method)
coros.append(asyncio.get_event_loop().run_in_executor(
None, partial(executor, *args, **kwargs)))
output = executor(*args, **kwargs) all_outputs = await asyncio.gather(*coros)
all_outputs.append(output)
if self.parallel_config.worker_use_ray:
all_outputs = await asyncio.gather(*all_outputs)
if get_all_outputs: if get_all_outputs:
return all_outputs return all_outputs
@@ -302,7 +301,16 @@ class AsyncLLMEngine:
elif self.worker_use_ray: elif self.worker_use_ray:
engine_class = ray.remote(num_cpus=0)(self._engine_class).remote engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
else: else:
engine_class = ray.remote(num_gpus=1)(self._engine_class).remote # FIXME(woosuk): This is a bit hacky. Be careful when changing the
# order of the arguments.
cache_config = args[1]
parallel_config = args[2]
if parallel_config.tensor_parallel_size == 1:
num_gpus = cache_config.gpu_memory_utilization
else:
num_gpus = 1
engine_class = ray.remote(num_gpus=num_gpus)(
self._engine_class).remote
return engine_class(*args, **kwargs) return engine_class(*args, **kwargs)
async def engine_step(self) -> bool: async def engine_step(self) -> bool:
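The fractional num_gpus logic above, reduced to its arithmetic (the gpu_memory_utilization value is illustrative):

tensor_parallel_size = 1
gpu_memory_utilization = 0.45

num_gpus = gpu_memory_utilization if tensor_parallel_size == 1 else 1
print(num_gpus)  # 0.45: Ray reserves a fraction of a GPU, so another actor
                 # (for example a second engine) can share the same device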
@@ -417,7 +425,8 @@ class AsyncLLMEngine:
request. request.
""" """
# Preprocess the request. # Preprocess the request.
arrival_time = time.time() # This should not be used for logging, as it is monotonic time.
arrival_time = time.monotonic()
try: try:
stream = await self.add_request(request_id, stream = await self.add_request(request_id,
@@ -483,7 +492,7 @@ class AsyncLLMEngine:
distributed_init_method, placement_group = initialize_cluster( distributed_init_method, placement_group = initialize_cluster(
parallel_config, engine_args.engine_use_ray) parallel_config, engine_args.engine_use_ray)
# Create the async LLM engine. # Create the async LLM engine.
engine = cls(engine_args.worker_use_ray, engine = cls(parallel_config.worker_use_ray,
engine_args.engine_use_ray, engine_args.engine_use_ray,
*engine_configs, *engine_configs,
distributed_init_method, distributed_init_method,

View File

@@ -7,13 +7,14 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig) SchedulerConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.ray_utils import RayWorker, initialize_cluster, ray from vllm.engine.metrics import record_metrics
from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
SequenceGroupMetadata, SequenceOutputs, SequenceGroupMetadata, SequenceGroupOutput,
SequenceStatus) SequenceOutput, SequenceStatus)
from vllm.transformers_utils.tokenizer import (detokenize_incrementally, from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
get_tokenizer) get_tokenizer)
from vllm.utils import Counter from vllm.utils import Counter
@@ -75,6 +76,7 @@ class LLMEngine:
f"tokenizer={model_config.tokenizer!r}, " f"tokenizer={model_config.tokenizer!r}, "
f"tokenizer_mode={model_config.tokenizer_mode}, " f"tokenizer_mode={model_config.tokenizer_mode}, "
f"revision={model_config.revision}, " f"revision={model_config.revision}, "
f"tokenizer_revision={model_config.tokenizer_revision}, "
f"trust_remote_code={model_config.trust_remote_code}, " f"trust_remote_code={model_config.trust_remote_code}, "
f"dtype={model_config.dtype}, " f"dtype={model_config.dtype}, "
f"max_seq_len={model_config.max_model_len}, " f"max_seq_len={model_config.max_model_len}, "
@@ -87,8 +89,6 @@ class LLMEngine:
self.model_config = model_config self.model_config = model_config
self.cache_config = cache_config self.cache_config = cache_config
assert self.cache_config.sliding_window == getattr(
self.model_config.hf_config, "sliding_window", None)
self.parallel_config = parallel_config self.parallel_config = parallel_config
self.scheduler_config = scheduler_config self.scheduler_config = scheduler_config
self.log_stats = log_stats self.log_stats = log_stats
@@ -98,6 +98,7 @@ class LLMEngine:
model_config.tokenizer, model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode, tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
tokenizer_revision=model_config.tokenizer_revision,
revision=model_config.revision) revision=model_config.revision)
self.seq_counter = Counter() self.seq_counter = Counter()
@@ -123,7 +124,7 @@ class LLMEngine:
def _init_workers(self, distributed_init_method: str): def _init_workers(self, distributed_init_method: str):
# Lazy import the Worker to avoid importing torch.cuda/xformers # Lazy import the Worker to avoid importing torch.cuda/xformers
# before CUDA_VISIBLE_DEVICES is set in the Worker # before CUDA_VISIBLE_DEVICES is set in the Worker
from vllm.worker.worker import Worker # pylint: disable=import-outside-toplevel from vllm.worker.worker import Worker
assert self.parallel_config.world_size == 1, ( assert self.parallel_config.world_size == 1, (
"Ray is required if parallel_config.world_size > 1.") "Ray is required if parallel_config.world_size > 1.")
@@ -141,25 +142,35 @@ class LLMEngine:
"init_model", "init_model",
get_all_outputs=True, get_all_outputs=True,
) )
self._run_workers(
"load_model",
get_all_outputs=True,
max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers,
)
def _init_workers_ray(self, placement_group: "PlacementGroup", def _init_workers_ray(self, placement_group: "PlacementGroup",
**ray_remote_kwargs): **ray_remote_kwargs):
# Lazy import the Worker to avoid importing torch.cuda/xformers # Lazy import the Worker to avoid importing torch.cuda/xformers
# before CUDA_VISIBLE_DEVICES is set in the Worker # before CUDA_VISIBLE_DEVICES is set in the Worker
from vllm.worker.worker import Worker # pylint: disable=import-outside-toplevel from vllm.worker.worker import Worker
self.workers: List[Worker] = [] self.workers: List[Worker] = []
for bundle in placement_group.bundle_specs: for bundle in placement_group.bundle_specs:
if not bundle.get("GPU", 0): if not bundle.get("GPU", 0):
continue continue
if self.parallel_config.tensor_parallel_size == 1:
num_gpus = self.cache_config.gpu_memory_utilization
else:
num_gpus = 1
worker = ray.remote( worker = ray.remote(
num_cpus=0, num_cpus=0,
num_gpus=1, num_gpus=num_gpus,
scheduling_strategy=PlacementGroupSchedulingStrategy( scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=placement_group, placement_group=placement_group,
placement_group_capture_child_tasks=True), placement_group_capture_child_tasks=True),
**ray_remote_kwargs, **ray_remote_kwargs,
)(RayWorker).remote(self.model_config.trust_remote_code) )(RayWorkerVllm).remote(self.model_config.trust_remote_code)
self.workers.append(worker) self.workers.append(worker)
# Initialize torch distributed process group for the workers. # Initialize torch distributed process group for the workers.
@@ -180,6 +191,12 @@ class LLMEngine:
"init_model", "init_model",
get_all_outputs=True, get_all_outputs=True,
) )
self._run_workers(
"load_model",
get_all_outputs=True,
max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers,
)
def _verify_args(self) -> None: def _verify_args(self) -> None:
self.model_config.verify_with_parallel_config(self.parallel_config) self.model_config.verify_with_parallel_config(self.parallel_config)
@@ -254,10 +271,10 @@ class LLMEngine:
prompt_token_ids: The token IDs of the prompt. If None, we prompt_token_ids: The token IDs of the prompt. If None, we
use the tokenizer to convert the prompts to token IDs. use the tokenizer to convert the prompts to token IDs.
arrival_time: The arrival time of the request. If None, we use arrival_time: The arrival time of the request. If None, we use
the current time. the current monotonic time.
""" """
if arrival_time is None: if arrival_time is None:
arrival_time = time.time() arrival_time = time.monotonic()
if prompt_token_ids is None: if prompt_token_ids is None:
assert prompt is not None assert prompt is not None
prompt_token_ids = self.tokenizer.encode(prompt) prompt_token_ids = self.tokenizer.encode(prompt)
@@ -348,9 +365,15 @@ class LLMEngine:
eos_token_id=self.tokenizer.eos_token_id)) eos_token_id=self.tokenizer.eos_token_id))
return current_worst_score >= highest_attainable_score return current_worst_score >= highest_attainable_score
def _process_sequence_group_samples( def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None:
samples: List[SequenceOutputs]) -> None: # Process prompt logprobs
prompt_logprobs = outputs.prompt_logprobs
if prompt_logprobs is not None:
seq_group.prompt_logprobs = prompt_logprobs
# Process samples
samples = outputs.samples
parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
existing_finished_seqs = seq_group.get_finished_seqs() existing_finished_seqs = seq_group.get_finished_seqs()
parent_child_dict = { parent_child_dict = {
@@ -364,7 +387,7 @@ class LLMEngine:
# Process the child samples for each parent sequence # Process the child samples for each parent sequence
for parent in parent_seqs: for parent in parent_seqs:
child_samples: List[SequenceOutputs] = parent_child_dict[ child_samples: List[SequenceOutput] = parent_child_dict[
parent.seq_id] parent.seq_id]
if len(child_samples) == 0: if len(child_samples) == 0:
# This parent sequence has no children samples. Remove # This parent sequence has no children samples. Remove
@@ -518,8 +541,8 @@ class LLMEngine:
scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
# Update the scheduled sequence groups with the model outputs. # Update the scheduled sequence groups with the model outputs.
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
for seq_group, samples in zip(scheduled_seq_groups, output): for seq_group, outputs in zip(scheduled_seq_groups, output):
self._process_sequence_group_samples(seq_group, samples) self._process_sequence_group_outputs(seq_group, outputs)
# Free the finished sequence groups. # Free the finished sequence groups.
self.scheduler.free_finished_seq_groups() self.scheduler.free_finished_seq_groups()
@@ -559,22 +582,22 @@ class LLMEngine:
blocks_to_copy=scheduler_outputs.blocks_to_copy, blocks_to_copy=scheduler_outputs.blocks_to_copy,
) )
return self._process_model_outputs(output, scheduler_outputs) + ignored return self._process_model_outputs(output, scheduler_outputs)
def _log_system_stats( def _log_system_stats(
self, self,
prompt_run: bool, prompt_run: bool,
num_batched_tokens: int, num_batched_tokens: int,
) -> None: ) -> None:
now = time.time() now = time.monotonic()
# Log the number of batched input tokens. # Log the number of batched input tokens.
if prompt_run: if prompt_run:
self.num_prompt_tokens.append((now, num_batched_tokens)) self.num_prompt_tokens.append((now, num_batched_tokens))
else: else:
self.num_generation_tokens.append((now, num_batched_tokens)) self.num_generation_tokens.append((now, num_batched_tokens))
elapsed_time = now - self.last_logging_time should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
if elapsed_time < _LOGGING_INTERVAL_SEC: if not should_log:
return return
# Discard the old stats. # Discard the old stats.
@@ -613,6 +636,16 @@ class LLMEngine:
else: else:
cpu_cache_usage = 0.0 cpu_cache_usage = 0.0
record_metrics(
avg_prompt_throughput=avg_prompt_throughput,
avg_generation_throughput=avg_generation_throughput,
scheduler_running=len(self.scheduler.running),
scheduler_swapped=len(self.scheduler.swapped),
scheduler_waiting=len(self.scheduler.waiting),
gpu_cache_usage=gpu_cache_usage,
cpu_cache_usage=cpu_cache_usage,
)
logger.info("Avg prompt throughput: " logger.info("Avg prompt throughput: "
f"{avg_prompt_throughput:.1f} tokens/s, " f"{avg_prompt_throughput:.1f} tokens/s, "
"Avg generation throughput: " "Avg generation throughput: "
@@ -624,8 +657,7 @@ class LLMEngine:
f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%") f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
self.last_logging_time = now self.last_logging_time = now
def _decode_sequence(self, seq: Sequence, def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
sampling_params: SamplingParams) -> None:
"""Decodes the new token for a sequence.""" """Decodes the new token for a sequence."""
(new_tokens, new_output_text, prefix_offset, (new_tokens, new_output_text, prefix_offset,
read_offset) = detokenize_incrementally( read_offset) = detokenize_incrementally(
@@ -634,7 +666,8 @@ class LLMEngine:
prev_tokens=seq.tokens, prev_tokens=seq.tokens,
prefix_offset=seq.prefix_offset, prefix_offset=seq.prefix_offset,
read_offset=seq.read_offset, read_offset=seq.read_offset,
skip_special_tokens=sampling_params.skip_special_tokens, skip_special_tokens=prms.skip_special_tokens,
spaces_between_special_tokens=prms.spaces_between_special_tokens,
) )
if seq.tokens is None: if seq.tokens is None:
seq.tokens = new_tokens seq.tokens = new_tokens
@@ -674,16 +707,15 @@ class LLMEngine:
seq.status = SequenceStatus.FINISHED_STOPPED seq.status = SequenceStatus.FINISHED_STOPPED
return return
def _run_workers( def _run_workers_in_batch(
self, self,
workers,
method: str, method: str,
*args, *args,
get_all_outputs: bool = False,
**kwargs, **kwargs,
) -> Any: ):
"""Runs the given method on all workers."""
all_outputs = [] all_outputs = []
for worker in self.workers: for worker in workers:
if self.parallel_config.worker_use_ray: if self.parallel_config.worker_use_ray:
executor = partial(worker.execute_method.remote, method) executor = partial(worker.execute_method.remote, method)
else: else:
@@ -691,9 +723,31 @@ class LLMEngine:
output = executor(*args, **kwargs) output = executor(*args, **kwargs)
all_outputs.append(output) all_outputs.append(output)
if self.parallel_config.worker_use_ray: if self.parallel_config.worker_use_ray:
all_outputs = ray.get(all_outputs) all_outputs = ray.get(all_outputs)
return all_outputs
def _run_workers(
self,
method: str,
*args,
get_all_outputs: bool = False,
max_concurrent_workers: Optional[int] = None,
**kwargs,
) -> Any:
"""Runs the given method on all workers."""
all_outputs = []
if max_concurrent_workers:
work_groups = [
self.workers[i:i + max_concurrent_workers]
for i in range(0, len(self.workers), max_concurrent_workers)
]
else:
work_groups = [self.workers]
for workers in work_groups:
all_outputs.extend(
self._run_workers_in_batch(workers, method, *args, **kwargs))
if get_all_outputs: if get_all_outputs:
return all_outputs return all_outputs
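The batching performed by the new _run_workers/_run_workers_in_batch split boils down to list slicing; a sketch with fake worker names:

workers = ["w0", "w1", "w2", "w3", "w4"]
max_concurrent_workers = 2

work_groups = [workers[i:i + max_concurrent_workers]
               for i in range(0, len(workers), max_concurrent_workers)]
print(work_groups)  # [['w0', 'w1'], ['w2', 'w3'], ['w4']]
# With --max-parallel-loading-workers 2, "load_model" therefore runs on at most
# two workers at a time, bounding peak host RAM during weight loading.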

vllm/engine/metrics.py (new file, 51 lines)
View File

@@ -0,0 +1,51 @@
from aioprometheus import Gauge
# The begin-* and end-* markers here are used by the documentation generator
# to extract the metrics definitions.
# begin-metrics-definitions
gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
"Average prefill throughput in tokens/s.")
gauge_avg_generation_throughput = Gauge(
"vllm:avg_generation_throughput_toks_per_s",
"Average generation throughput in tokens/s.")
gauge_scheduler_running = Gauge(
"vllm:num_requests_running",
"Number of requests that is currently running for inference.")
gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
"Number requests swapped to CPU.")
gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
"Number of requests waiting to be processed.")
gauge_gpu_cache_usage = Gauge(
"vllm:gpu_cache_usage_perc",
"GPU KV-cache usage. 1 means 100 percent usage.")
gauge_cpu_cache_usage = Gauge(
"vllm:cpu_cache_usage_perc",
"CPU KV-cache usage. 1 means 100 percent usage.")
# end-metrics-definitions
labels = {}
def add_global_metrics_labels(**kwargs):
labels.update(kwargs)
def record_metrics(
avg_prompt_throughput: float,
avg_generation_throughput: float,
scheduler_running: int,
scheduler_swapped: int,
scheduler_waiting: int,
gpu_cache_usage: float,
cpu_cache_usage: float,
):
gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
gauge_scheduler_running.set(labels, scheduler_running)
gauge_scheduler_swapped.set(labels, scheduler_swapped)
gauge_scheduler_waiting.set(labels, scheduler_waiting)
gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
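Once the OpenAI-compatible server from this diff is running, the gauges above are exposed through the /metrics route it registers; a minimal scrape, assuming the requests package is available and the default host/port:

import requests

text = requests.get("http://localhost:8000/metrics").text
for line in text.splitlines():
    if line.startswith("vllm:"):
        print(line)  # e.g. vllm:num_requests_running{...} 0.0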

View File

@@ -10,13 +10,12 @@ try:
import ray import ray
from ray.air.util.torch_dist import TorchDistributedWorker from ray.air.util.torch_dist import TorchDistributedWorker
class RayWorker(TorchDistributedWorker): class RayWorkerVllm(TorchDistributedWorker):
"""Ray wrapper for vllm.worker.Worker, allowing Worker to be """Ray wrapper for vllm.worker.Worker, allowing Worker to be
lazily initialized after Ray sets CUDA_VISIBLE_DEVICES.""" lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""
def __init__(self, init_cached_hf_modules=False) -> None: def __init__(self, init_cached_hf_modules=False) -> None:
if init_cached_hf_modules: if init_cached_hf_modules:
# pylint: disable=import-outside-toplevel
from transformers.dynamic_module_utils import init_hf_modules from transformers.dynamic_module_utils import init_hf_modules
init_hf_modules() init_hf_modules()
self.worker = None self.worker = None
@@ -37,7 +36,7 @@ except ImportError as e:
"`pip install ray pandas pyarrow`.") "`pip install ray pandas pyarrow`.")
ray = None ray = None
TorchDistributedWorker = None TorchDistributedWorker = None
RayWorker = None # pylint: disable=invalid-name RayWorkerVllm = None
if TYPE_CHECKING: if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup from ray.util.placement_group import PlacementGroup

View File

@@ -17,6 +17,12 @@ app = FastAPI()
engine = None engine = None
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.post("/generate") @app.post("/generate")
async def generate(request: Request) -> Response: async def generate(request: Request) -> Response:
"""Generate completion for the request. """Generate completion for the request.
@@ -65,7 +71,7 @@ async def generate(request: Request) -> Response:
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()
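A quick liveness probe against the /health route added above, assuming the server is running on its default port and the requests package is installed:

import requests

resp = requests.get("http://localhost:8000/health")
print(resp.status_code)  # 200 when the server is up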

View File

@@ -42,6 +42,8 @@ class LLM:
quantized and use `dtype` to determine the data type of the weights. quantized and use `dtype` to determine the data type of the weights.
revision: The specific model version to use. It can be a branch name, revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id. a tag name, or a commit id.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id.
seed: The seed to initialize the random number generator for sampling. seed: The seed to initialize the random number generator for sampling.
gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
reserve for the model weights, activations, and KV cache. Higher reserve for the model weights, activations, and KV cache. Higher
@@ -65,6 +67,7 @@ class LLM:
dtype: str = "auto", dtype: str = "auto",
quantization: Optional[str] = None, quantization: Optional[str] = None,
revision: Optional[str] = None, revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: int = 0, seed: int = 0,
gpu_memory_utilization: float = 0.9, gpu_memory_utilization: float = 0.9,
swap_space: int = 4, swap_space: int = 4,
@@ -81,6 +84,7 @@ class LLM:
dtype=dtype, dtype=dtype,
quantization=quantization, quantization=quantization,
revision=revision, revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed, seed=seed,
gpu_memory_utilization=gpu_memory_utilization, gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space, swap_space=swap_space,
@@ -130,25 +134,21 @@ class LLM:
if isinstance(prompts, str): if isinstance(prompts, str):
# Convert a single prompt to a list. # Convert a single prompt to a list.
prompts = [prompts] prompts = [prompts]
if prompts is not None and prompt_token_ids is not None: if (prompts is not None and prompt_token_ids is not None
if len(prompts) != len(prompt_token_ids): and len(prompts) != len(prompt_token_ids)):
raise ValueError("The lengths of prompts and prompt_token_ids " raise ValueError("The lengths of prompts and prompt_token_ids "
"must be the same.") "must be the same.")
if sampling_params is None: if sampling_params is None:
# Use default sampling params. # Use default sampling params.
sampling_params = SamplingParams() sampling_params = SamplingParams()
# Add requests to the engine. # Add requests to the engine.
if prompts is not None: num_requests = len(prompts) if prompts is not None else len(
num_requests = len(prompts) prompt_token_ids)
else:
num_requests = len(prompt_token_ids)
for i in range(num_requests): for i in range(num_requests):
prompt = prompts[i] if prompts is not None else None prompt = prompts[i] if prompts is not None else None
if prompt_token_ids is None: token_ids = None if prompt_token_ids is None else prompt_token_ids[
token_ids = None i]
else:
token_ids = prompt_token_ids[i]
self._add_request(prompt, sampling_params, token_ids) self._add_request(prompt, sampling_params, token_ids)
return self._run_engine(use_tqdm) return self._run_engine(use_tqdm)
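A short sketch of the new tokenizer_revision knob from the caller's side; the revision values and the prompt are placeholders, not taken from the diff:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m",
          revision="main",
          tokenizer_revision="main")
outputs = llm.generate(["The capital of France is"],
                       sampling_params=SamplingParams(temperature=0, max_tokens=8))
print(outputs[0].outputs[0].text)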

View File

@@ -3,21 +3,24 @@
import argparse import argparse
import asyncio import asyncio
import codecs
import json import json
import time import time
from http import HTTPStatus from http import HTTPStatus
from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
from aioprometheus import MetricsMiddleware
from aioprometheus.asgi.starlette import metrics
import fastapi import fastapi
import uvicorn import uvicorn
from fastapi import Request from fastapi import Request
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse, Response
from packaging import version
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import add_global_metrics_labels
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
CompletionRequest, CompletionResponse, CompletionResponseChoice, CompletionRequest, CompletionResponse, CompletionResponseChoice,
CompletionResponseStreamChoice, CompletionStreamResponse, CompletionResponseStreamChoice, CompletionStreamResponse,
@@ -31,20 +34,59 @@ from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import random_uuid from vllm.utils import random_uuid
try:
import fastchat
from fastchat.conversation import Conversation, SeparatorStyle
from fastchat.model.model_adapter import get_conversation_template
_fastchat_available = True
except ImportError:
_fastchat_available = False
TIMEOUT_KEEP_ALIVE = 5 # seconds TIMEOUT_KEEP_ALIVE = 5 # seconds
logger = init_logger(__name__) logger = init_logger(__name__)
served_model = None served_model = None
app = fastapi.FastAPI() app = fastapi.FastAPI()
engine = None engine = None
response_role = None
def parse_args():
parser = argparse.ArgumentParser(
description="vLLM OpenAI-Compatible RESTful API server.")
parser.add_argument("--host", type=str, default=None, help="host name")
parser.add_argument("--port", type=int, default=8000, help="port number")
parser.add_argument("--allow-credentials",
action="store_true",
help="allow credentials")
parser.add_argument("--allowed-origins",
type=json.loads,
default=["*"],
help="allowed origins")
parser.add_argument("--allowed-methods",
type=json.loads,
default=["*"],
help="allowed methods")
parser.add_argument("--allowed-headers",
type=json.loads,
default=["*"],
help="allowed headers")
parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. If not "
"specified, the model name will be the same as "
"the huggingface name.")
parser.add_argument("--chat-template",
type=str,
default=None,
help="The file path to the chat template, "
"or the template in single-line form "
"for the specified model")
parser.add_argument("--response-role",
type=str,
default="assistant",
help="The role name to return if "
"`request.add_generation_prompt=true`.")
parser = AsyncEngineArgs.add_cli_args(parser)
return parser.parse_args()
app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics
app.add_route("/metrics", metrics) # Exposes HTTP metrics
def create_error_response(status_code: HTTPStatus,
@@ -54,8 +96,27 @@ def create_error_response(status_code: HTTPStatus,
                        status_code=status_code.value)
def load_chat_template(args, tokenizer):
if args.chat_template is not None:
try:
with open(args.chat_template, "r") as f:
chat_template = f.read()
except OSError:
# If opening a file fails, set chat template to be args to
# ensure we decode so our escape are interpreted correctly
chat_template = codecs.decode(args.chat_template, "unicode_escape")
tokenizer.chat_template = chat_template
logger.info(
f"Using supplied chat template:\n{tokenizer.chat_template}")
elif tokenizer.chat_template is not None:
logger.info(f"Using default chat template:\n{tokenizer.chat_template}")
else:
logger.warning("No chat template provided. Chat API will not work.")
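As a rough illustration of the inline-template path above (the model name and the template itself are made up, and a recent transformers with chat-template support is assumed): an escaped single-line Jinja string is decoded with codecs and attached to a Hugging Face tokenizer, which can then render a message list.

import codecs
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any HF tokenizer; name is illustrative

# A single-line template as it might be passed via --chat-template, with escaped newlines.
raw_template = "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\\n{% endfor %}"
tokenizer.chat_template = codecs.decode(raw_template, "unicode_escape")

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=False,
)
print(repr(prompt))  # 'user: Hello!\n'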
@app.exception_handler(RequestValidationError)
-async def validation_exception_handler(request, exc):  # pylint: disable=unused-argument
+async def validation_exception_handler(_, exc):
    return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))
@@ -69,53 +130,6 @@ async def check_model(request) -> Optional[JSONResponse]:
return ret return ret
async def get_gen_prompt(request) -> str:
if not _fastchat_available:
raise ModuleNotFoundError(
"fastchat is not installed. Please install fastchat to use "
"the chat completion and conversation APIs: `$ pip install fschat`"
)
if version.parse(fastchat.__version__) < version.parse("0.2.23"):
raise ImportError(
f"fastchat version is low. Current version: {fastchat.__version__} "
"Please upgrade fastchat to use: `$ pip install -U fschat`")
conv = get_conversation_template(request.model)
conv = Conversation(
name=conv.name,
system_template=conv.system_template,
system_message=conv.system_message,
roles=conv.roles,
messages=list(conv.messages), # prevent in-place modification
offset=conv.offset,
sep_style=SeparatorStyle(conv.sep_style),
sep=conv.sep,
sep2=conv.sep2,
stop_str=conv.stop_str,
stop_token_ids=conv.stop_token_ids,
)
if isinstance(request.messages, str):
prompt = request.messages
else:
for message in request.messages:
msg_role = message["role"]
if msg_role == "system":
conv.system_message = message["content"]
elif msg_role == "user":
conv.append_message(conv.roles[0], message["content"])
elif msg_role == "assistant":
conv.append_message(conv.roles[1], message["content"])
else:
raise ValueError(f"Unknown role: {msg_role}")
# Add a blank message for the assistant.
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
return prompt
async def check_length(
    request: Union[ChatCompletionRequest, CompletionRequest],
    prompt: Optional[str] = None,
@@ -124,10 +138,8 @@ async def check_length(
    assert (not (prompt is None and prompt_ids is None)
            and not (prompt is not None and prompt_ids is not None)
            ), "Either prompt or prompt_ids should be provided."
-    if prompt_ids is not None:
-        input_ids = prompt_ids
-    else:
-        input_ids = tokenizer(prompt).input_ids
+    input_ids = prompt_ids if prompt_ids is not None else tokenizer(
+        prompt).input_ids
    token_num = len(input_ids)

    if request.max_tokens is None:
@@ -145,6 +157,12 @@ async def check_length(
    return input_ids, None
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.get("/v1/models")
async def show_available_models():
    """Show available models. Right now we only have one model."""
@@ -156,16 +174,26 @@ async def show_available_models():
    return ModelList(data=model_cards)


-def create_logprobs(token_ids: List[int],
-                    id_logprobs: List[Dict[int, float]],
-                    initial_text_offset: int = 0) -> LogProbs:
+def create_logprobs(
+    token_ids: List[int],
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
+    num_output_top_logprobs: Optional[int] = None,
+    initial_text_offset: int = 0,
+) -> LogProbs:
    """Create OpenAI-style logprobs."""
    logprobs = LogProbs()
    last_token_len = 0
-    for token_id, id_logprob in zip(token_ids, id_logprobs):
+    if num_output_top_logprobs:
+        logprobs.top_logprobs = []
+    for i, token_id in enumerate(token_ids):
+        step_top_logprobs = top_logprobs[i]
+        if step_top_logprobs is not None:
+            token_logprob = step_top_logprobs[token_id]
+        else:
+            token_logprob = None
        token = tokenizer.convert_ids_to_tokens(token_id)
        logprobs.tokens.append(token)
-        logprobs.token_logprobs.append(id_logprob[token_id])
+        logprobs.token_logprobs.append(token_logprob)
        if len(logprobs.text_offset) == 0:
            logprobs.text_offset.append(initial_text_offset)
        else:
@@ -173,10 +201,11 @@ def create_logprobs(token_ids: List[int],
                                        last_token_len)
        last_token_len = len(token)

-        logprobs.top_logprobs.append({
-            tokenizer.convert_ids_to_tokens(i): p
-            for i, p in id_logprob.items()
-        })
+        if num_output_top_logprobs:
+            logprobs.top_logprobs.append({
+                tokenizer.convert_ids_to_tokens(i): p
+                for i, p in step_top_logprobs.items()
+            } if step_top_logprobs else None)
    return logprobs
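A quick sketch of how the text_offset bookkeeping above behaves: each token's character offset is the previous offset plus the previous token's length, starting from initial_text_offset (token strings here are purely illustrative).

def text_offsets(tokens, initial_text_offset=0):
    # Mirrors the accumulation pattern: first offset is the initial one,
    # each later offset adds the length of the previous token string.
    offsets, last_len = [], 0
    for tok in tokens:
        offsets.append(initial_text_offset if not offsets else offsets[-1] + last_len)
        last_len = len(tok)
    return offsets

print(text_offsets(["Hello", ",", " world"]))  # [0, 5, 6]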
@@ -192,8 +221,6 @@ async def create_chat_completion(request: ChatCompletionRequest,
    - function_call (Users should implement this by themselves)
    - logit_bias (to be supported by vLLM engine)
    """
-    logger.info(f"Received chat completion request: {request}")
    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret
@@ -203,15 +230,25 @@ async def create_chat_completion(request: ChatCompletionRequest,
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "logit_bias is not currently supported")

-    prompt = await get_gen_prompt(request)
+    try:
+        prompt = tokenizer.apply_chat_template(
+            conversation=request.messages,
+            tokenize=False,
+            add_generation_prompt=request.add_generation_prompt)
+    except Exception as e:
+        logger.error(f"Error in applying chat template from request: {str(e)}")
+        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))

    token_ids, error_check_ret = await check_length(request, prompt=prompt)
    if error_check_ret is not None:
        return error_check_ret

    model_name = request.model
    request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
+    chunk_object_type = "chat.completion.chunk"
    try:
+        spaces_between_special_tokens = request.spaces_between_special_tokens
        sampling_params = SamplingParams(
            n=request.n,
            presence_penalty=request.presence_penalty,
@@ -226,6 +263,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
            ignore_eos=request.ignore_eos,
            use_beam_search=request.use_beam_search,
            skip_special_tokens=request.skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
        )
    except ValueError as e:
        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
@@ -233,116 +271,162 @@ async def create_chat_completion(request: ChatCompletionRequest,
    result_generator = engine.generate(prompt, sampling_params, request_id,
                                       token_ids)

-    def create_stream_response_json(
-        index: int,
-        text: str,
-        finish_reason: Optional[str] = None,
-    ) -> str:
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=index,
-            delta=DeltaMessage(content=text),
-            finish_reason=finish_reason,
-        )
-        response = ChatCompletionStreamResponse(
-            id=request_id,
-            created=created_time,
-            model=model_name,
-            choices=[choice_data],
-        )
-        response_json = response.json(ensure_ascii=False)
-
-        return response_json
+    def get_role() -> str:
+        if request.add_generation_prompt:
+            return response_role
+        else:
+            return request.messages[-1]["role"]

    async def completion_stream_generator() -> AsyncGenerator[str, None]:
-        # First chunk with role
+        # Send first response for each request.n (index) with the role
+        role = get_role()
        for i in range(request.n):
-            choice_data = ChatCompletionResponseStreamChoice(
-                index=i,
-                delta=DeltaMessage(role="assistant"),
-                finish_reason=None,
-            )
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=i, delta=DeltaMessage(role=role), finish_reason=None)
            chunk = ChatCompletionStreamResponse(id=request_id,
+                                                 object=chunk_object_type,
+                                                 created=created_time,
                                                 choices=[choice_data],
                                                 model=model_name)
            data = chunk.json(exclude_unset=True, ensure_ascii=False)
            yield f"data: {data}\n\n"

+        # Send response to echo the input portion of the last message
+        if request.echo:
+            last_msg_content = ""
+            if request.messages and isinstance(
+                    request.messages, list) and request.messages[-1].get(
+                        "content") and request.messages[-1].get(
+                            "role") == role:
+                last_msg_content = request.messages[-1]["content"]
+            if last_msg_content:
+                for i in range(request.n):
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=last_msg_content),
+                        finish_reason=None)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    data = chunk.json(exclude_unset=True, ensure_ascii=False)
+                    yield f"data: {data}\n\n"

+        # Send response for each token for each request.n (index)
        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
+        finish_reason_sent = [False] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index
-                delta_text = output.text[len(previous_texts[i]):]
-                previous_texts[i] = output.text
-                previous_num_tokens[i] = len(output.token_ids)
-                response_json = create_stream_response_json(
-                    index=i,
-                    text=delta_text,
-                )
-                yield f"data: {response_json}\n\n"
-                if output.finish_reason is not None:
-                    response_json = create_stream_response_json(
-                        index=i,
-                        text="",
-                        finish_reason=output.finish_reason,
-                    )
-                    yield f"data: {response_json}\n\n"
+
+                if finish_reason_sent[i]:
+                    continue
+
+                if output.finish_reason is None:
+                    # Send token-by-token response for each request.n
+                    delta_text = output.text[len(previous_texts[i]):]
+                    previous_texts[i] = output.text
+                    completion_tokens = len(output.token_ids)
+                    previous_num_tokens[i] = completion_tokens
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=delta_text),
+                        finish_reason=None)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    data = chunk.json(exclude_unset=True, ensure_ascii=False)
+                    yield f"data: {data}\n\n"
+                else:
+                    # Send the finish response for each request.n only once
+                    prompt_tokens = len(res.prompt_token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    )
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i, delta=[], finish_reason=output.finish_reason)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    if final_usage is not None:
+                        chunk.usage = final_usage
+                    data = chunk.json(exclude_unset=True,
+                                      exclude_none=True,
+                                      ensure_ascii=False)
+                    yield f"data: {data}\n\n"
+                    finish_reason_sent[i] = True
+        # Send the final done message after all response.n are finished
        yield "data: [DONE]\n\n"
async def completion_full_generator():
final_res: RequestOutput = None
async for res in result_generator:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
"Client disconnected")
final_res = res
assert final_res is not None
choices = []
role = get_role()
for output in final_res.outputs:
choice_data = ChatCompletionResponseChoice(
index=output.index,
message=ChatMessage(role=role, content=output.text),
finish_reason=output.finish_reason,
)
choices.append(choice_data)
if request.echo:
last_msg_content = ""
if request.messages and isinstance(
request.messages, list) and request.messages[-1].get(
"content") and request.messages[-1].get(
"role") == role:
last_msg_content = request.messages[-1]["content"]
for choice in choices:
full_message = last_msg_content + choice.message.content
choice.message.content = full_message
num_prompt_tokens = len(final_res.prompt_token_ids)
num_generated_tokens = sum(
len(output.token_ids) for output in final_res.outputs)
usage = UsageInfo(
prompt_tokens=num_prompt_tokens,
completion_tokens=num_generated_tokens,
total_tokens=num_prompt_tokens + num_generated_tokens,
)
response = ChatCompletionResponse(
id=request_id,
created=created_time,
model=model_name,
choices=choices,
usage=usage,
)
return response
    # Streaming response
    if request.stream:
        return StreamingResponse(completion_stream_generator(),
                                 media_type="text/event-stream")
-    else:
-        # Non-streaming response
-        final_res: RequestOutput = None
-        async for res in result_generator:
-            if await raw_request.is_disconnected():
-                # Abort the request if the client disconnects.
-                await engine.abort(request_id)
-                return create_error_response(HTTPStatus.BAD_REQUEST,
-                                             "Client disconnected")
-            final_res = res
-        assert final_res is not None
-        choices = []
-        for output in final_res.outputs:
-            choice_data = ChatCompletionResponseChoice(
-                index=output.index,
-                message=ChatMessage(role="assistant", content=output.text),
-                finish_reason=output.finish_reason,
-            )
-            choices.append(choice_data)
-
-        num_prompt_tokens = len(final_res.prompt_token_ids)
-        num_generated_tokens = sum(
-            len(output.token_ids) for output in final_res.outputs)
-        usage = UsageInfo(
-            prompt_tokens=num_prompt_tokens,
-            completion_tokens=num_generated_tokens,
-            total_tokens=num_prompt_tokens + num_generated_tokens,
-        )
-        response = ChatCompletionResponse(
-            id=request_id,
-            created=created_time,
-            model=model_name,
-            choices=choices,
-            usage=usage,
-        )
-
-        if request.stream:
-            # When user requests streaming but we don't stream, we still need to
-            # return a streaming response with a single event.
-            response_json = response.json(ensure_ascii=False)
-
-            async def fake_stream_generator() -> AsyncGenerator[str, None]:
-                yield f"data: {response_json}\n\n"
-                yield "data: [DONE]\n\n"
-
-            return StreamingResponse(fake_stream_generator(),
-                                     media_type="text/event-stream")
-
-        return response
+
+    return await completion_full_generator()
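The streaming branch emits Server-Sent Events, one "data: ..." line per chunk, terminated by "data: [DONE]". A minimal client sketch for consuming that stream (endpoint URL and model name are illustrative, not taken from this diff):

import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "facebook/opt-125m",  # illustrative model name
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines():
    # Each event line looks like: b'data: {...}'; the stream ends with b'data: [DONE]'.
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    delta = chunk["choices"][0].get("delta") or {}
    if isinstance(delta, dict) and "content" in delta:
        print(delta["content"], end="", flush=True)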
@app.post("/v1/completions")
@@ -353,23 +437,17 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
    for the API specification. This API mimics the OpenAI Completion API.

    NOTE: Currently we do not support the following features:
-        - echo (since the vLLM engine does not currently support
-          getting the logprobs of prompt tokens)
        - suffix (the language models we currently support do not support
          suffix)
        - logit_bias (to be supported by vLLM engine)
    """
-    logger.info(f"Received completion request: {request}")
    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret

-    if request.echo:
-        # We do not support echo since the vLLM engine does not
-        # currently support getting the logprobs of prompt tokens.
-        return create_error_response(HTTPStatus.BAD_REQUEST,
-                                     "echo is not currently supported")
+    # OpenAI API supports echoing the prompt when max_tokens is 0.
+    echo_without_generation = request.echo and request.max_tokens == 0

    if request.suffix is not None:
        # The language models we currently support do not support suffix.
@@ -411,8 +489,9 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
    if error_check_ret is not None:
        return error_check_ret

-    created_time = int(time.time())
+    created_time = int(time.monotonic())
    try:
+        spaces_between_special_tokens = request.spaces_between_special_tokens
        sampling_params = SamplingParams(
            n=request.n,
            best_of=request.best_of,
@@ -424,10 +503,13 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
            stop=request.stop,
            stop_token_ids=request.stop_token_ids,
            ignore_eos=request.ignore_eos,
-            max_tokens=request.max_tokens,
+            max_tokens=request.max_tokens
+            if not echo_without_generation else 1,
            logprobs=request.logprobs,
            use_beam_search=request.use_beam_search,
+            prompt_logprobs=request.logprobs if request.echo else None,
            skip_special_tokens=request.skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
        )
    except ValueError as e:
        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
@@ -452,6 +534,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
        text: str,
        logprobs: Optional[LogProbs] = None,
        finish_reason: Optional[str] = None,
+        usage: Optional[UsageInfo] = None,
    ) -> str:
        choice_data = CompletionResponseStreamChoice(
            index=index,
@@ -465,41 +548,69 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
            model=model_name,
            choices=[choice_data],
        )
-        response_json = response.json(ensure_ascii=False)
+        if usage is not None:
+            response.usage = usage
+        response_json = response.json(exclude_unset=True, ensure_ascii=False)

        return response_json
    async def completion_stream_generator() -> AsyncGenerator[str, None]:
        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
+        has_echoed = [False] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index
                delta_text = output.text[len(previous_texts[i]):]
+                token_ids = output.token_ids[previous_num_tokens[i]:]
+                top_logprobs = output.logprobs[previous_num_tokens[i]:]
+                offsets = len(previous_texts[i])
+                if request.echo and not has_echoed[i]:
+                    if not echo_without_generation:
+                        delta_text = res.prompt + delta_text
+                        token_ids = res.prompt_token_ids + token_ids
+                        top_logprobs = res.prompt_logprobs + top_logprobs
+                    else:
+                        delta_text = res.prompt
+                        token_ids = res.prompt_token_ids
+                        top_logprobs = res.prompt_logprobs
+                    has_echoed[i] = True
                if request.logprobs is not None:
                    logprobs = create_logprobs(
-                        output.token_ids[previous_num_tokens[i]:],
-                        output.logprobs[previous_num_tokens[i]:],
-                        len(previous_texts[i]))
+                        token_ids=token_ids,
+                        top_logprobs=top_logprobs,
+                        num_output_top_logprobs=request.logprobs,
+                        initial_text_offset=offsets,
+                    )
                else:
                    logprobs = None
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)
+                finish_reason = output.finish_reason
                response_json = create_stream_response_json(
                    index=i,
                    text=delta_text,
                    logprobs=logprobs,
+                    finish_reason=finish_reason,
                )
                yield f"data: {response_json}\n\n"
                if output.finish_reason is not None:
                    logprobs = (LogProbs()
                                if request.logprobs is not None else None)
+                    prompt_tokens = len(res.prompt_token_ids)
+                    completion_tokens = len(output.token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    )
                    response_json = create_stream_response_json(
                        index=i,
                        text="",
                        logprobs=logprobs,
                        finish_reason=output.finish_reason,
+                        usage=final_usage,
                    )
                    yield f"data: {response_json}\n\n"
        yield "data: [DONE]\n\n"
@@ -520,14 +631,36 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
        final_res = res
    assert final_res is not None
    choices = []
+    prompt_token_ids = final_res.prompt_token_ids
+    prompt_logprobs = final_res.prompt_logprobs
+    prompt_text = final_res.prompt
    for output in final_res.outputs:
        if request.logprobs is not None:
-            logprobs = create_logprobs(output.token_ids, output.logprobs)
+            if not echo_without_generation:
+                token_ids = output.token_ids
+                top_logprobs = output.logprobs
+                if request.echo:
+                    token_ids = prompt_token_ids + token_ids
+                    top_logprobs = prompt_logprobs + top_logprobs
+            else:
+                token_ids = prompt_token_ids
+                top_logprobs = prompt_logprobs
+            logprobs = create_logprobs(
+                token_ids=token_ids,
+                top_logprobs=top_logprobs,
+                num_output_top_logprobs=request.logprobs,
+            )
        else:
            logprobs = None
+        if not echo_without_generation:
+            output_text = output.text
+            if request.echo:
+                output_text = prompt_text + output_text
+        else:
+            output_text = prompt_text
        choice_data = CompletionResponseChoice(
            index=output.index,
-            text=output.text,
+            text=output_text,
            logprobs=logprobs,
            finish_reason=output.finish_reason,
        )
@@ -565,37 +698,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="vLLM OpenAI-Compatible RESTful API server.")
-    parser.add_argument("--host",
-                        type=str,
-                        default="localhost",
-                        help="host name")
-    parser.add_argument("--port", type=int, default=8000, help="port number")
-    parser.add_argument("--allow-credentials",
-                        action="store_true",
-                        help="allow credentials")
-    parser.add_argument("--allowed-origins",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed origins")
-    parser.add_argument("--allowed-methods",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed methods")
-    parser.add_argument("--allowed-headers",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed headers")
-    parser.add_argument("--served-model-name",
-                        type=str,
-                        default=None,
-                        help="The model name used in the API. If not "
-                        "specified, the model name will be the same as "
-                        "the huggingface name.")
-    parser = AsyncEngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    args = parse_args()
    app.add_middleware(
        CORSMiddleware,
@@ -612,15 +715,22 @@ if __name__ == "__main__":
    else:
        served_model = args.model

+    response_role = args.response_role

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    engine_model_config = asyncio.run(engine.get_model_config())
    max_model_len = engine_model_config.max_model_len

    # A separate tokenizer to map token IDs to strings.
-    tokenizer = get_tokenizer(engine_args.tokenizer,
-                              tokenizer_mode=engine_args.tokenizer_mode,
-                              trust_remote_code=engine_args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        engine_model_config.tokenizer,
+        tokenizer_mode=engine_model_config.tokenizer_mode,
+        trust_remote_code=engine_model_config.trust_remote_code)
+    load_chat_template(args, tokenizer)
+
+    # Register labels for metrics
+    add_global_metrics_labels(model_name=engine_args.model)

    uvicorn.run(app,
                host=args.host,
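Taken together, the new flags might be exercised roughly like this (model name, template path, and port are illustrative, and the OpenAI v1 Python client is assumed):

# Launch, e.g.:
#   python -m vllm.entrypoints.openai.api_server \
#       --model meta-llama/Llama-2-7b-chat-hf \
#       --chat-template ./my_template.jinja --response-role assistant
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
chat = client.chat.completions.create(
    model="meta-llama/Llama-2-7b-chat-hf",
    messages=[{"role": "user", "content": "Say hello"}],
)
print(chat.choices[0].message.content)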

View File

@@ -72,6 +72,9 @@ class ChatCompletionRequest(BaseModel):
    use_beam_search: Optional[bool] = False
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    skip_special_tokens: Optional[bool] = True
+    spaces_between_special_tokens: Optional[bool] = True
+    add_generation_prompt: Optional[bool] = True
+    echo: Optional[bool] = False


class CompletionRequest(BaseModel):
@@ -98,14 +101,14 @@ class CompletionRequest(BaseModel):
    use_beam_search: Optional[bool] = False
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    skip_special_tokens: Optional[bool] = True
+    spaces_between_special_tokens: Optional[bool] = True


class LogProbs(BaseModel):
    text_offset: List[int] = Field(default_factory=list)
    token_logprobs: List[Optional[float]] = Field(default_factory=list)
    tokens: List[str] = Field(default_factory=list)
-    top_logprobs: List[Optional[Dict[str,
-                                     float]]] = Field(default_factory=list)
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None


class CompletionResponseChoice(BaseModel):
@@ -137,6 +140,7 @@ class CompletionStreamResponse(BaseModel):
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo]


class ChatMessage(BaseModel):
@@ -176,3 +180,5 @@ class ChatCompletionStreamResponse(BaseModel):
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(
+        default=None, description="data about request and response")
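A request body exercising the new ChatCompletionRequest fields might look like this (the model name is illustrative; the defaults follow the fields above):

payload = {
    "model": "my-model",  # illustrative
    "messages": [{"role": "user", "content": "Hello"}],
    "add_generation_prompt": True,          # let the chat template add the assistant prefix
    "echo": False,                          # optionally replay the last message before generating
    "spaces_between_special_tokens": True,  # forwarded to SamplingParams
}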

View File

@@ -48,4 +48,9 @@ _setup_logger()
def init_logger(name: str):
-    return logging.getLogger(name)
+    # Use the same settings as above for root logger
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(_default_handler)
+    logger.propagate = False
+    return logger

View File

@@ -1,9 +1,11 @@
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed

 __all__ = [
     "InputMetadata",
     "get_model",
+    "SamplingMetadata",
     "set_random_seed",
 ]

View File

@@ -1,86 +1,42 @@
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional

import torch
-from xformers.ops import AttentionBias
-
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import SequenceData


class InputMetadata:
-    """Metadata for input sequences. Used for PagedAttention.
+    """Metadata for input sequences. Used in PagedAttention.

    Args:
-        seq_groups: List of (seq_ids, sampling_params).
-        seq_data: Seq_id -> SequenceData.
        prompt_lens: Lengths of prompts.
        slot_mapping: The address to write the new KV to of each token.
-        context_lens: the length of attention context for each generation token.
        max_context_len: The maximum context length.
+        context_lens: the length of attention context for each sequence.
        block_tables: The block tables. (Seq id -> list of physical block)
    """

    def __init__(
        self,
-        seq_groups: List[Tuple[List[int], SamplingParams]],
-        seq_data: Dict[int, SequenceData],
        prompt_lens: List[int],
        slot_mapping: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        block_tables: torch.Tensor,
-        sliding_window: Optional[int] = None,
+        max_context_len: Optional[int],
+        context_lens: Optional[torch.Tensor],
+        block_tables: Optional[torch.Tensor],
    ) -> None:
-        self.seq_groups = seq_groups
-        self.seq_data = seq_data
        self.prompt_lens = prompt_lens
+        self.max_context_len = max_context_len
        self.slot_mapping = slot_mapping
        self.context_lens = context_lens
-        self.max_context_len = max_context_len
        self.block_tables = block_tables

-        self.to_cache = None
-        if sliding_window is not None:
-            # We need to keep the positions of sliding windows within
-            # the key / value tables, this is helpful to know which
-            # elements we need to cache and where
-            to_cache, start_idx = [], 0
-            for prompt_len in self.prompt_lens:
-                to_cache.extend(
-                    range(
-                        start_idx + max(0, prompt_len - sliding_window),
-                        start_idx + prompt_len,
-                    ))
-                start_idx += prompt_len
-            to_cache.extend(range(start_idx, slot_mapping.shape[0]))
-            self.to_cache = torch.tensor(to_cache,
-                                         dtype=torch.int32,
-                                         device=self.slot_mapping.device)
-
-        self.num_prompts = len(prompt_lens)
-        self.num_prompt_tokens = sum(prompt_lens)
-        self.num_generation_tokens = context_lens.shape[0]
-        self.num_valid_tokens = slot_mapping.shape[0]
-        if block_tables.numel() > 0:
-            self.max_num_blocks_per_seq = block_tables.shape[1]
-        else:
-            self.max_num_blocks_per_seq = 0
-        assert block_tables.shape[0] == self.num_generation_tokens
-        assert context_lens.shape[0] == self.num_generation_tokens
+        self.is_prompt = len(prompt_lens) > 0

        # Set during the execution of the first attention op.
-        self.attn_bias: List[AttentionBias] = []
+        # FIXME(woosuk): This is a hack.
+        self.attn_bias = None

    def __repr__(self) -> str:
-        # Print only useful metadata.
-        return (f'InputMetadata('
-                f'num_valid_tokens={self.num_valid_tokens}, '
-                f'num_prompt_tokens={self.num_prompt_tokens}, '
-                f'num_prompts={self.num_prompts}, '
-                f'prompt_lens={self.prompt_lens}, '
-                f'num_generation_tokens={self.num_generation_tokens}, '
-                f'context_lens={self.context_lens}, '
-                f'max_context_len={self.max_context_len}), '
-                f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
-                f'block_tables={self.block_tables}), '
-                f'slot_mapping={self.slot_mapping}')
+        return ("InputMetadata("
+                f"prompt_lens={self.prompt_lens}, "
+                f"max_context_len={self.max_context_len}, "
+                f"slot_mapping={self.slot_mapping}, "
+                f"context_lens={self.context_lens}, "
+                f"block_tables={self.block_tables})")
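Under the reduced signature, building the metadata for a prompt-phase batch might look roughly like this (values are made up for illustration and do not come from this diff):

import torch
from vllm.model_executor import InputMetadata

prompt_lens = [5, 3]  # two prompts in this batch
input_metadata = InputMetadata(
    prompt_lens=prompt_lens,
    slot_mapping=torch.arange(8, dtype=torch.long),  # one KV-cache slot per token
    max_context_len=None,   # unused during the prompt phase
    context_lens=None,
    block_tables=None,
)
print(input_metadata.is_prompt)  # True, since prompt_lens is non-empty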

View File

@@ -1,48 +1,113 @@
"""Custom activation functions.""" """Custom activation functions."""
import math
from typing import Optional
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F
from vllm import activation_ops from vllm._C import ops
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.utils import divide
from vllm.model_executor.utils import set_weight_attrs
class SiluAndMul(nn.Module): class SiluAndMul(nn.Module):
"""An activation function for SwiGLU. """An activation function for SwiGLU.
The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2. The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
Shapes: Shapes:
x: (num_tokens, 2 * d) x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
return: (num_tokens, d) return: (batch_size, seq_len, d) or (num_tokens, d)
""" """
def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
d = x.shape[-1] // 2
return F.silu(x[..., :d]) * x[..., d:]
def forward(self, x: torch.Tensor) -> torch.Tensor: def forward(self, x: torch.Tensor) -> torch.Tensor:
num_tokens = x.shape[0] d = x.shape[-1] // 2
d = x.shape[1] // 2 output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
activation_ops.silu_and_mul(out, x) ops.silu_and_mul(out, x)
return out return out
class NewGELU(nn.Module): class NewGELU(nn.Module):
def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
c = math.sqrt(2.0 / math.pi)
return 0.5 * x * (1.0 + torch.tanh(c *
(x + 0.044715 * torch.pow(x, 3.0))))
def forward(self, x: torch.Tensor) -> torch.Tensor: def forward(self, x: torch.Tensor) -> torch.Tensor:
num_tokens = x.shape[0] out = torch.empty_like(x)
d = x.shape[1] ops.gelu_new(out, x)
out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
activation_ops.gelu_new(out, x)
return out return out
class FastGELU(nn.Module): class FastGELU(nn.Module):
def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
(1.0 + 0.044715 * x * x)))
def forward(self, x: torch.Tensor) -> torch.Tensor: def forward(self, x: torch.Tensor) -> torch.Tensor:
num_tokens = x.shape[0] out = torch.empty_like(x)
d = x.shape[1] ops.gelu_fast(out, x)
out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
activation_ops.gelu_fast(out, x)
return out return out
class ScaledActivation(nn.Module):
"""An activation function with post-scale parameters.
This is used for some quantization methods like AWQ.
"""
def __init__(
self,
act_module: nn.Module,
intermediate_size: int,
input_is_parallel: bool = True,
params_dtype: Optional[torch.dtype] = None,
):
super().__init__()
self.act = act_module
self.input_is_parallel = input_is_parallel
if input_is_parallel:
tp_size = get_tensor_model_parallel_world_size()
intermediate_size_per_partition = divide(intermediate_size,
tp_size)
else:
intermediate_size_per_partition = intermediate_size
if params_dtype is None:
params_dtype = torch.get_default_dtype()
self.scales = nn.Parameter(
torch.empty(intermediate_size_per_partition,
dtype=params_dtype,
device="cuda"))
set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.act(x) / self.scales
def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
param_data = param.data
if self.input_is_parallel:
tp_rank = get_tensor_model_parallel_rank()
shard_size = param_data.shape[0]
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
_ACTIVATION_REGISTRY = { _ACTIVATION_REGISTRY = {
"gelu": nn.GELU(), "gelu": nn.GELU(),
"gelu_fast": FastGELU(), "gelu_fast": FastGELU(),
@@ -52,9 +117,25 @@ _ACTIVATION_REGISTRY = {
} }
def get_act_fn(act_fn: str) -> nn.Module: def get_act_fn(
act_fn_name: str,
quant_config: Optional[QuantizationConfig] = None,
intermediate_size: Optional[int] = None,
input_is_parallel: bool = True,
params_dtype: Optional[torch.dtype] = None,
) -> nn.Module:
"""Get an activation function by name.""" """Get an activation function by name."""
act_fn = act_fn.lower() act_fn_name = act_fn_name.lower()
if act_fn in _ACTIVATION_REGISTRY: if act_fn_name not in _ACTIVATION_REGISTRY:
return _ACTIVATION_REGISTRY[act_fn] raise ValueError(
raise ValueError(f"Activation function {act_fn!r} is not supported.") f"Activation function {act_fn_name!r} is not supported.")
act_fn = _ACTIVATION_REGISTRY[act_fn_name]
if (quant_config is not None
and act_fn_name in quant_config.get_scaled_act_names()):
if intermediate_size is None:
raise ValueError("intermediate_size must be specified for scaled "
"activation functions.")
return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
params_dtype)
return act_fn
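The _forward methods above are PyTorch-native references for the custom CUDA ops; a standalone sketch of the same formulas in plain PyTorch (shapes and inputs are made up):

import math
import torch
import torch.nn.functional as F

x = torch.randn(4, 16)

# SiluAndMul: silu(x[..., :d]) * x[..., d:] with d = last_dim // 2
d = x.shape[-1] // 2
swiglu = F.silu(x[..., :d]) * x[..., d:]
print(swiglu.shape)  # torch.Size([4, 8])

# NewGELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
c = math.sqrt(2.0 / math.pi)
new_gelu = 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * x.pow(3))))

# The tanh approximation tracks PyTorch's built-in approximate GELU.
print(torch.allclose(new_gelu, F.gelu(x, approximate="tanh"), atol=1e-6))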

View File

@@ -1,5 +1,5 @@
"""Multi-head attention.""" """Multi-head attention."""
from typing import Any, Dict, List, Optional from typing import List, Optional
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -7,65 +7,49 @@ from xformers import ops as xops
from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
LowerTriangularMaskWithTensorBias) LowerTriangularMaskWithTensorBias)
from vllm import attention_ops from vllm._C import ops
from vllm import cache_ops from vllm._C import cache_ops
from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.rotary_embedding import (
DynamicNTKScalingRotaryEmbedding, LinearScalingRotaryEmbedding,
RotaryEmbedding)
_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256]
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512
class PagedAttention(nn.Module): class PagedAttention(nn.Module):
# pylint: disable=line-too-long """MHA/MQA/GQA layer with PagedAttention.
"""GPT-style multi-head PagedAttention.
This class takes flattened 1D query, key, and value tensors as input. The
input 1D tensors can either contain prompt tokens or generation tokens, in
addition to paddings.
If the input tensors contain prompt tokens, the layout is as follows:
|<---------------------- num_valid_tokens ---------------------->|
|<--------------- num_prompt_tokens -------------->|
|<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|<--padding-->|
Otherwise, the layout is as follows:
|<------------------ num_valid_tokens ------------------->|
|<------- num_generation_tokens (M) ------->|
|<--generation_0-->|...|<--generation_M-1-->|<--padding-->|
The prompts might have different lengths, while the generation tokens always
have length 1. The paddings are appended to make the input length a multiple
of 8, which is desirable for Tensor Cores.
This class takes query, key, and value tensors as input. The input tensors
can either contain prompt tokens or generation tokens.
The class does the following: The class does the following:
1. Perform multi_query_kv_attention for the prompts. This operation does
not use the KV cache. 1. Wait for the cache operations (e.g., swap, copy) to finish. The cache
2. Wait for the cache operations (e.g., swap, copy) to finish. The cache
operations are issued by the cache engine before executing the forward operations are issued by the cache engine before executing the forward
pass of the model, and they are executed asynchronously. pass of the model, and they are executed asynchronously.
3. Reshape and store the input key and value tensors in the KV cache. 2. Reshape and store the input key and value tensors in the KV cache.
4. Perform single_query_cached_kv_attention for the generation tokens. 3. Perform (multi-head/multi-query/grouped-query) attention using either
This operation reads the previous key and value tensors from the KV xformers or the PagedAttention custom op.
cache. 4. Return the output tensor.
5. Output a flattened 1D tensor.
""" """
def __init__(self, def __init__(
num_heads: int, self,
head_size: int, num_heads: int,
scale: float, head_size: int,
num_kv_heads: Optional[int] = None, scale: float,
sliding_window: Optional[int] = None) -> None: num_kv_heads: Optional[int] = None,
alibi_slopes: Optional[List[float]] = None,
sliding_window: Optional[int] = None,
) -> None:
super().__init__() super().__init__()
self.num_heads = num_heads self.num_heads = num_heads
self.head_size = head_size self.head_size = head_size
self.scale = float(scale) self.scale = float(scale)
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
self.sliding_window = sliding_window self.sliding_window = sliding_window
if alibi_slopes is not None:
alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
self.register_buffer("alibi_slopes", alibi_slopes, persistent=False)
assert self.num_heads % self.num_kv_heads == 0 assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.num_queries_per_kv = self.num_heads // self.num_kv_heads
@@ -77,93 +61,6 @@ class PagedAttention(nn.Module):
raise ValueError(f"head_size ({self.head_size}) is not supported. " raise ValueError(f"head_size ({self.head_size}) is not supported. "
f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.") f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.")
def set_attn_bias(
self,
input_metadata: InputMetadata,
dtype: torch.dtype,
) -> None:
del dtype # Unused.
if input_metadata.attn_bias:
# Already set by a previous layer.
return
prompt_lens = input_metadata.prompt_lens
attn_bias = BlockDiagonalCausalMask.from_seqlens(prompt_lens)
if self.sliding_window is not None:
attn_bias = attn_bias.make_local_attention(self.sliding_window)
input_metadata.attn_bias.append(attn_bias)
def multi_query_kv_attention(
self,
output: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
input_metadata: InputMetadata,
) -> torch.Tensor:
"""Normal attention for the prompt tokens.
Args:
output: shape = [num_prompt_tokens, num_heads, head_size]
query: shape = [num_prompt_tokens, num_heads, head_size]
key: shape = [num_prompt_tokens, num_kv_heads, head_size]
value: shape = [num_prompt_tokens, num_kv_heads, head_size]
input_metadata: metadata for paged attention.
"""
if self.num_kv_heads != self.num_heads:
# Project the key and value tensors to the desired number of heads.
key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
value = torch.repeat_interleave(value,
self.num_queries_per_kv,
dim=1)
# TODO(woosuk): The unsqueeze op may incur some CPU overhead. Optimize.
out = xops.memory_efficient_attention_forward(
query.unsqueeze(0),
key.unsqueeze(0),
value.unsqueeze(0),
attn_bias=input_metadata.attn_bias[0],
p=0.0,
scale=self.scale,
)
# TODO(woosuk): Unnecessary copy. Optimize.
output.copy_(out.squeeze(0))
return output
def single_query_cached_kv_attention(
self,
output: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
) -> None:
"""PagedAttention for the generation tokens.
Args:
output: shape = [num_generation_tokens, num_heads, head_size]
query: shape = [num_generation_tokens, num_heads, head_size]
key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
block_size, x]
value_cache: shape = [num_blocks, num_kv_heads, head_size,
block_size]
input_metadata: metadata for paged attention.
"""
block_size = value_cache.shape[3]
attention_ops.single_query_cached_kv_attention(
output,
query,
key_cache,
value_cache,
self.head_mapping,
self.scale,
input_metadata.block_tables,
input_metadata.context_lens,
block_size,
input_metadata.max_context_len,
None, # alibi_slopes
)
def forward( def forward(
self, self,
query: torch.Tensor, query: torch.Tensor,
@@ -176,291 +73,210 @@ class PagedAttention(nn.Module):
) -> torch.Tensor: ) -> torch.Tensor:
"""PagedAttention forward pass. """PagedAttention forward pass.
NOTE: The query, key, and value tensors must be sliced from a qkv
tensor of shape [num_tokens, 3 * num_heads * head_size].
Args: Args:
query: shape = [num_tokens, num_heads * head_size] query: shape = [batch_size, seq_len, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size] key: shape = [batch_size, seq_len, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size] value: shape = [batch_size, num_kv_heads * head_size]
key_cache: shape = [num_blocks, num_kv_heads, head_size/x, key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
block_size, x] block_size, x]
value_cache: shape = [num_blocks, num_kv_heads, head_size, value_cache: shape = [num_blocks, num_kv_heads, head_size,
block_size] block_size]
input_metadata: metadata for paged attention. input_metadata: metadata for the inputs.
cache_event: event to wait for the cache operations to finish. cache_event: event to wait for the cache operations to finish.
Returns: Returns:
shape = [num_tokens, num_heads * head_size] shape = [batch_size, seq_len, num_heads * head_size]
""" """
batch_size, seq_len, hidden_size = query.shape
# Reshape the query, key, and value tensors. # Reshape the query, key, and value tensors.
query = query.view(-1, self.num_heads, self.head_size) query = query.view(-1, self.num_heads, self.head_size)
key = key.view(-1, self.num_kv_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size)
value = value.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size)
slot_mapping = input_metadata.slot_mapping.flatten()
# Pre-allocate the output tensor.
output = torch.empty_like(query)
# Compute the attention op for prompts.
num_prompt_tokens = input_metadata.num_prompt_tokens
if num_prompt_tokens > 0:
# Prompt run.
assert input_metadata.num_generation_tokens == 0
self.set_attn_bias(input_metadata, dtype=query.dtype)
self.multi_query_kv_attention(
output[:num_prompt_tokens],
query[:num_prompt_tokens],
key[:num_prompt_tokens],
value[:num_prompt_tokens],
input_metadata,
)
# Wait until the cache op is done.
if cache_event is not None: if cache_event is not None:
cache_event.wait() cache_event.wait()
# Reshape the keys and values and store them in the cache. # Reshape the keys and values and store them in the cache.
# When key_cache and value_cache are not provided, the new key # If key_cache and value_cache are not provided, the new key and value
# and value vectors will not be cached. # vectors will not be cached. This happens during the initial memory
num_valid_tokens = input_metadata.num_valid_tokens # profiling run.
if (num_valid_tokens > 0 and key_cache is not None if key_cache is not None and value_cache is not None:
and value_cache is not None):
# The stride is 3 because the key and value are sliced from qkv.
key_to_cache = key[:num_valid_tokens]
value_to_cache = value[:num_valid_tokens]
slot_mapping = input_metadata.slot_mapping
if input_metadata.to_cache is not None:
key_to_cache = key_to_cache[input_metadata.to_cache]
value_to_cache = value_to_cache[input_metadata.to_cache]
slot_mapping = slot_mapping[input_metadata.to_cache]
cache_ops.reshape_and_cache( cache_ops.reshape_and_cache(
key_to_cache, key,
value_to_cache, value,
key_cache, key_cache,
value_cache, value_cache,
slot_mapping, slot_mapping,
) )
if input_metadata.num_generation_tokens > 0: if input_metadata.is_prompt:
# Decoding run. # Prompt run.
assert input_metadata.num_prompt_tokens == 0 if self.num_kv_heads != self.num_heads:
assert key_cache is not None and value_cache is not None, ( # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
"key_cache and value_cache must be provided when " # project the key and value tensors to the desired number of
"generating tokens.") # heads.
# Compute the attention op for generation tokens. # TODO(woosuk): Use MQA/GQA kernels for higher performance.
self.single_query_cached_kv_attention( query = query.view(query.shape[0], self.num_kv_heads,
output[num_prompt_tokens:num_valid_tokens], self.num_queries_per_kv, query.shape[-1])
query[num_prompt_tokens:num_valid_tokens], key_cache, key = key[:, :,
value_cache, input_metadata) None, :].expand(key.shape[0], self.num_kv_heads,
self.num_queries_per_kv,
key.shape[-1])
value = value[:, :, None, :].expand(value.shape[0],
self.num_kv_heads,
self.num_queries_per_kv,
value.shape[-1])
# Reshape the output tensor. # Set attention bias if not provided. This typically happens at the
# NOTE(woosuk): The output tensor may include paddings. # very attention layer of every iteration.
return output.view(-1, self.num_heads * self.head_size) # FIXME(woosuk): This is a hack.
if input_metadata.attn_bias is None:
if self.alibi_slopes is None:
attn_bias = BlockDiagonalCausalMask.from_seqlens(
[seq_len] * batch_size)
if self.sliding_window is not None:
attn_bias = attn_bias.make_local_attention(
self.sliding_window)
input_metadata.attn_bias = attn_bias
else:
input_metadata.attn_bias = _make_alibi_bias(
self.alibi_slopes, batch_size, seq_len, query.dtype)
# TODO(woosuk): Too many view operations. Let's try to reduce them
class PagedAttentionWithRoPE(PagedAttention): # in the future for code readability.
"""PagedAttention with rotary positional embedding.""" if self.alibi_slopes is None:
query = query.unsqueeze(0)
def __init__( key = key.unsqueeze(0)
self, value = value.unsqueeze(0)
num_heads: int,
head_size: int,
scale: float,
rotary_dim: int,
max_position: int = 8192,
base: int = 10000,
num_kv_heads: Optional[int] = None,
is_neox_style: bool = True,
rope_scaling: Optional[Dict[str, Any]] = None,
sliding_window: Optional[int] = None,
) -> None:
super().__init__(num_heads,
head_size,
scale,
num_kv_heads,
sliding_window=sliding_window)
if rope_scaling is None:
self.rotary_emb = RotaryEmbedding(head_size, rotary_dim,
max_position, base,
is_neox_style)
else:
scaling_type = rope_scaling["type"]
scaling_factor = rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = LinearScalingRotaryEmbedding(
head_size, rotary_dim, max_position, base, is_neox_style,
scaling_factor)
elif scaling_type == "dynamic":
self.rotary_emb = DynamicNTKScalingRotaryEmbedding(
head_size, rotary_dim, max_position, base, is_neox_style,
scaling_factor)
else: else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}") query = query.unflatten(0, (batch_size, seq_len))
key = key.unflatten(0, (batch_size, seq_len))
value = value.unflatten(0, (batch_size, seq_len))
def forward(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
cache_event: Optional[torch.cuda.Event],
) -> torch.Tensor:
""" PagedAttention forward pass with rotary embedding.
Args:
positions: shape = [num_tokens]
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
block_size, x]
value_cache: shape = [num_blocks, num_kv_heads, head_size,
block_size]
input_metadata: metadata for paged attention.
cache_event: event to wait for the cache operations to finish.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
# Apply rotary embedding to the query and key before passing them
# to the attention op.
query, key = self.rotary_emb(positions, query, key)
return super().forward(
query,
key,
value,
key_cache,
value_cache,
input_metadata,
cache_event,
)
class PagedAttentionWithALiBi(PagedAttention):
"""PagedAttention with ALiBi attention bias."""
def __init__(self,
num_heads: int,
head_size: int,
scale: float,
slopes: List[float],
num_kv_heads: Optional[int] = None) -> None:
super().__init__(num_heads, head_size, scale, num_kv_heads)
assert len(slopes) == num_heads
slopes = torch.tensor(slopes, dtype=torch.float32)
self.register_buffer("alibi_slopes", slopes, persistent=False)
def set_attn_bias(self, input_metadata: InputMetadata,
dtype: torch.dtype) -> None:
if input_metadata.attn_bias:
# Already set by a previous layer.
return
# Generates ALiBi mask for each prompt.
for prompt_len in input_metadata.prompt_lens:
bias = torch.arange(prompt_len, dtype=dtype)
# Note(zhuohan): HF uses
# `bias = bias[None, :].repeat(prompt_len, 1)`
# here. We find that both biases give the same results, but
# the bias below more accurately follows the original ALiBi
# paper.
bias = bias[None, :] - bias[:, None]
bias = bias.to(self.alibi_slopes.device)
# When using custom attention bias, xformers requires the bias to
# be sliced from a tensor whose length is a multiple of 8.
padded_len = (prompt_len + 7) // 8 * 8
bias = torch.empty(
1, # batch_size
self.num_heads,
prompt_len,
padded_len,
device=self.alibi_slopes.device,
dtype=dtype,
)[:, :, :, :prompt_len].copy_(bias)
bias.mul_(self.alibi_slopes[:, None, None])
attn_bias = LowerTriangularMaskWithTensorBias(bias)
input_metadata.attn_bias.append(attn_bias)
def multi_query_kv_attention(
self,
output: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
input_metadata: InputMetadata,
) -> torch.Tensor:
"""Attention with ALiBi bias for the prompt tokens.
Args:
output: shape = [num_prompt_tokens, num_heads, head_size]
query: shape = [num_prompt_tokens, num_heads, head_size]
key: shape = [num_prompt_tokens, num_kv_heads, head_size]
value: shape = [num_prompt_tokens, num_kv_heads, head_size]
input_metadata: metadata for paged attention.
"""
if self.num_kv_heads != self.num_heads:
# Project the key and value tensors to the desired number of heads.
key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
value = torch.repeat_interleave(value,
self.num_queries_per_kv,
dim=1)
# FIXME(woosuk): Because xformers does not support dynamic sequence
# lengths with custom attention bias, we process each prompt one by
# one. This is inefficient, especially when we have many short prompts.
start = 0
for i, prompt_len in enumerate(input_metadata.prompt_lens):
end = start + prompt_len
out = xops.memory_efficient_attention_forward(
query[None, start:end],
key[None, start:end],
value[None, start:end],
attn_bias=input_metadata.attn_bias[i],
p=0.0,
scale=self.scale,
)
# TODO(woosuk): Unnecessary copy. Optimize.
output[start:end].copy_(out.squeeze(0))
start += prompt_len
return output
def single_query_cached_kv_attention(
self,
output: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
) -> None:
"""PagedAttention with ALiBi bias for the generation tokens.
Args:
output: shape = [num_generation_tokens, num_heads, head_size]
query: shape = [num_generation_tokens, num_heads, head_size]
key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
block_size, x]
value_cache: shape = [num_blocks, num_kv_heads, head_size,
block_size]
input_metadata: metadata for paged attention.
"""
block_size = value_cache.shape[3]
attention_ops.single_query_cached_kv_attention(
output,
query,
key_cache,
value_cache,
self.head_mapping,
self.scale,
input_metadata.block_tables,
input_metadata.context_lens,
block_size,
input_metadata.max_context_len,
self.alibi_slopes,
)

out = xops.memory_efficient_attention_forward(
query,
key,
value,
attn_bias=input_metadata.attn_bias,
p=0.0,
scale=self.scale,
)
output = out.view_as(query)
else:
# Decoding run.
output = _paged_attention(
query,
key_cache,
value_cache,
input_metadata,
self.head_mapping,
self.scale,
self.alibi_slopes,
)
# Reshape the output tensor.
return output.view(batch_size, seq_len, hidden_size)
def _make_alibi_bias(
alibi_slopes: torch.Tensor,
batch_size: int,
seq_len: int,
dtype: torch.dtype,
) -> LowerTriangularMaskWithTensorBias:
bias = torch.arange(seq_len, dtype=dtype)
# NOTE(zhuohan): HF uses
# `bias = bias[None, :].repeat(prompt_len, 1)`
# here. We find that both biases give the same results, but
# the bias below more accurately follows the original ALiBi
# paper.
bias = bias[None, :] - bias[:, None]
bias = bias.to(alibi_slopes.device)
# When using custom attention bias, xformers requires the bias to
# be sliced from a tensor whose length is a multiple of 8.
padded_len = (seq_len + 7) // 8 * 8
bias = torch.empty(
batch_size,
alibi_slopes.shape[0],
seq_len,
padded_len,
device=alibi_slopes.device,
dtype=dtype,
)[:, :, :, :seq_len].copy_(bias)
bias.mul_(alibi_slopes[:, None, None])
attn_bias = LowerTriangularMaskWithTensorBias(bias)
return attn_bias
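
The padded_len arithmetic above can be seen in isolation in the hedged snippet below: the bias buffer's trailing dimension is rounded up to a multiple of 8 and then sliced back, so the view handed to xformers keeps a row stride that satisfies its alignment requirement. The concrete sizes are illustrative.

import torch

seq_len = 13                               # illustrative
padded_len = (seq_len + 7) // 8 * 8        # 16
num_heads = 8                              # illustrative
buf = torch.empty(1, num_heads, seq_len, padded_len)
bias_view = buf[:, :, :, :seq_len]         # shape [1, 8, 13, 13]
print(bias_view.stride(-2))                # 16: the underlying row stride stays a multiple of 8
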
def _paged_attention(
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
head_mapping: torch.Tensor,
scale: float,
alibi_slopes: Optional[torch.Tensor],
) -> torch.Tensor:
output = torch.empty_like(query)
block_size = value_cache.shape[3]
num_seqs, num_heads, head_size = query.shape
max_num_partitions = (
(input_metadata.max_context_len + _PARTITION_SIZE - 1) //
_PARTITION_SIZE)
# NOTE(woosuk): We use a simple heuristic to decide whether to use
# PagedAttention V1 or V2. If the number of partitions is 1, we use
# V1 to avoid the overhead of reduction. Also, if the number of
# sequences or heads is large, we use V1 since there is enough work
# to parallelize.
# TODO(woosuk): Tune this heuristic.
# For context len > 8192, use V2 kernel to avoid shared memory shortage.
use_v1 = input_metadata.max_context_len <= 8192 and (
max_num_partitions == 1 or num_seqs * num_heads > 512)
if use_v1:
# Run PagedAttention V1.
ops.paged_attention_v1(
output,
query,
key_cache,
value_cache,
head_mapping,
scale,
input_metadata.block_tables,
input_metadata.context_lens,
block_size,
input_metadata.max_context_len,
alibi_slopes,
)
else:
# Run PagedAttention V2.
assert _PARTITION_SIZE % block_size == 0
tmp_output = torch.empty(
size=(num_seqs, num_heads, max_num_partitions, head_size),
dtype=output.dtype,
device=output.device,
)
exp_sums = torch.empty(
size=(num_seqs, num_heads, max_num_partitions),
dtype=torch.float32,
device=output.device,
)
max_logits = torch.empty_like(exp_sums)
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
head_mapping,
scale,
input_metadata.block_tables,
input_metadata.context_lens,
block_size,
input_metadata.max_context_len,
alibi_slopes,
)
return output
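
A worked example of the V1/V2 heuristic, assuming _PARTITION_SIZE is 512 (the module-level constant is not shown in this excerpt, so treat that value as an assumption):

_PARTITION_SIZE = 512                     # assumed value of the module constant
max_context_len = 2000
num_seqs, num_heads = 16, 40

max_num_partitions = (max_context_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE   # 4
use_v1 = max_context_len <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
# 16 * 40 = 640 > 512, so V1 is chosen: there are already enough (sequence, head)
# pairs to fill the GPU without splitting each context into partitions; with only
# a couple of long sequences, the same formula would pick V2 instead.
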


@@ -1,8 +1,10 @@
"""Custom normalization layers.""" """Custom normalization layers."""
from typing import Optional, Tuple, Union
import torch import torch
import torch.nn as nn import torch.nn as nn
from vllm import layernorm_ops from vllm._C import ops
class RMSNorm(nn.Module): class RMSNorm(nn.Module):
@@ -21,9 +23,41 @@ class RMSNorm(nn.Module):
self.weight = nn.Parameter(torch.ones(hidden_size)) self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps self.variance_epsilon = eps
def forward(self, x: torch.Tensor) -> torch.Tensor: def _forward(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
"""PyTorch-native implementation equivalent to forward()."""
orig_dtype = x.dtype
x = x.to(torch.float32)
if residual is not None:
x = x + residual.to(torch.float32)
residual = x.to(orig_dtype)
variance = x.pow(2).mean(dim=-1, keepdim=True)
x = x * torch.rsqrt(variance + self.variance_epsilon)
x = x.to(orig_dtype) * self.weight
if residual is None:
return x
else:
return x, residual
def forward(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
if residual is not None:
ops.fused_add_rms_norm(
x,
residual,
self.weight.data,
self.variance_epsilon,
)
return x, residual
out = torch.empty_like(x)
layernorm_ops.rms_norm(
ops.rms_norm(
out,
x,
self.weight.data,

Some files were not shown because too many files have changed in this diff.
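
For reference, here is a hedged usage sketch of the fused-residual contract the layernorm diff above introduces for RMSNorm; it assumes the class is constructed as RMSNorm(hidden_size), which is outside the shown hunks.

import torch

# RMSNorm as defined in the diff above; the constructor signature is assumed.
norm = RMSNorm(8)
x = torch.randn(2, 8)
residual = torch.randn(2, 8)

y = norm._forward(x)              # no residual: returns only the normalized tensor

out, new_residual = norm._forward(x, residual)
# With a residual, the PyTorch-native path returns (rmsnorm(x + residual), x + residual);
# the second value is the updated residual stream the caller passes to the next layer,
# mirroring what ops.fused_add_rms_norm does in place on the CUDA path.
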