Compare commits

208 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 0f90effc66 | |
| | 464dd985e3 | |
| | c07a442854 | |
| | cd3aa153a4 | |
| | 9b294976a2 | |
| | 5313c2cb8b | |
| | 5f09cbdb63 | |
| | 4cefa9b49b | |
| | f86bd6190a | |
| | e5452ddfd6 | |
| | d06980dfa7 | |
| | 66785cc05c | |
| | 05a38612b0 | |
| | d27f4bae39 | |
| | 8d8c2f6ffe | |
| | 51d3cb951d | |
| | e74b1736a1 | |
| | f07c1ceaa5 | |
| | 63b2206ad0 | |
| | 27feead2f8 | |
| | c782195662 | |
| | 0f621c2c7d | |
| | a9e4574261 | |
| | 0229c386c5 | |
| | a7b3e33078 | |
| | e19a64c7ef | |
| | 1cb4ad8de9 | |
| | 6ed068a71a | |
| | 708e6c18b0 | |
| | b943890484 | |
| | a1125ad4df | |
| | a8b150c595 | |
| | 665cbcec4b | |
| | 7c600440f7 | |
| | e0c6f556e8 | |
| | de23687d16 | |
| | 4cea74c73b | |
| | a921d8be9d | |
| | 094f716bf2 | |
| | 7d761fe3c1 | |
| | cf35d8f3d7 | |
| | 4bb6b67188 | |
| | 819b18e7ba | |
| | 19849db573 | |
| | 3d4ceb292c | |
| | f5a37c6c6c | |
| | 32c927b53f | |
| | 5ffc0d13a2 | |
| | 112627e8b2 | |
| | 37c1e3c218 | |
| | 06e9ebebd5 | |
| | c5f7740d89 | |
| | be66d9b125 | |
| | e1054247ba | |
| | 8d17774f92 | |
| | e946260cf3 | |
| | edb305584b | |
| | bb00f66e19 | |
| | e87557b069 | |
| | dcc543a298 | |
| | 0fc280b06c | |
| | 20d0699d49 | |
| | 686f5e3210 | |
| | 415d109527 | |
| | 521b35f799 | |
| | cb08cd0d75 | |
| | 2a2c135b41 | |
| | 65ea2ddf17 | |
| | b514d3c496 | |
| | 7076fa1c9f | |
| | 660a7fcfa4 | |
| | 054072bee5 | |
| | eb825c1e74 | |
| | 1b290ace4f | |
| | 0d578228ca | |
| | aebfcb262a | |
| | ab9e8488d5 | |
| | fd58b73a40 | |
| | 8efe23f150 | |
| | 06458a0b42 | |
| | 1a2bbc9301 | |
| | e7f579eb97 | |
| | 8516999495 | |
| | 9f669a9a7c | |
| | 555bdcc5a3 | |
| | 54ca1ba71d | |
| | 9738b84a08 | |
| | 1fe0990023 | |
| | 7e90a2d117 | |
| | 5687d584fe | |
| | cf8849f2d6 | |
| | e575df33b1 | |
| | 0ce8647dc5 | |
| | 9cabcb7645 | |
| | 7b895c5976 | |
| | 7013a80170 | |
| | 79a30912b8 | |
| | 2f3d36a8a1 | |
| | ac8d36f3e5 | |
| | 15f5632365 | |
| | aa9af07cac | |
| | 69be658bba | |
| | beac8dd461 | |
| | 28b47d1e49 | |
| | 1f24755bf8 | |
| | bf31d3606a | |
| | d189170b6c | |
| | f61dc8072f | |
| | f8a1e39fae | |
| | a132435204 | |
| | 9524867701 | |
| | c1376e0f82 | |
| | 651c614aa4 | |
| | d3a5bd9fb7 | |
| | e8ef4c0820 | |
| | 348897af31 | |
| | 9d9072a069 | |
| | 928de46888 | |
| | 29678cd213 | |
| | d0740dff1b | |
| | de89472897 | |
| | e7c8555d06 | |
| | ec3b5ce9cc | |
| | 6368e777a8 | |
| | 875afe38ab | |
| | ee8217e5be | |
| | 980dd4a2c4 | |
| | 8285736840 | |
| | 91fce82c6f | |
| | ac5cf86aa6 | |
| | 6a6119554c | |
| | b95ee898fe | |
| | 9eed4d1f3e | |
| | 6b5296aa3a | |
| | ee92b58b3a | |
| | 09ff7f106a | |
| | acbed3ef40 | |
| | 66d18a7fb0 | |
| | ba0bfd40e2 | |
| | 84e4e37d14 | |
| | a60b353005 | |
| | ebe4d1db3a | |
| | b5a10eb0ef | |
| | 0967102c6d | |
| | e2fb71ec9f | |
| | f936657eb6 | |
| | 6f88f762bf | |
| | 202351d5bf | |
| | 2e8e49fce3 | |
| | a8e98aee0c | |
| | bb1ba58f06 | |
| | 7bedab5748 | |
| | 20f7cc4cde | |
| | 649aa730c5 | |
| | a19bc5c628 | |
| | 28e616c4e3 | |
| | 30e775281d | |
| | 21877b0d75 | |
| | cf5cb1e33e | |
| | 03ffd0a022 | |
| | a425bd9a9a | |
| | bbbf86565f | |
| | 9f6be8692e | |
| | f187877945 | |
| | 947b794146 | |
| | 8d926e91f1 | |
| | 4ee52bb169 | |
| | 7d7e3b78a3 | |
| | f98b745a81 | |
| | 2d1e86f1b1 | |
| | 1ac4ccf73c | |
| | 2ac4d5e2bf | |
| | 3302f0aef3 | |
| | 6f2dd6c37e | |
| | bc0644574c | |
| | 400b8289f7 | |
| | c1026311b5 | |
| | 2b1c116b5a | |
| | cc796b1358 | |
| | f029ef94d7 | |
| | 95592fa00a | |
| | fbe66e1d0b | |
| | 90979c38f8 | |
| | e21d7687a9 | |
| | ff36139ffc | |
| | e3e79e9e8a | |
| | b9fe4616f9 | |
| | 64ca424e75 | |
| | b5f93d0631 | |
| | a58936966f | |
| | dd54a4b026 | |
| | eda1a7cad3 | |
| | f04908cae7 | |
| | ab019eea75 | |
| | 9841d48a10 | |
| | 3272d7a0b7 | |
| | 0bb1e885a0 | |
| | d6545ad22e | |
| | 90eb3f43ca | |
| | e67b4f2c2a | |
| | d6770d1f23 | |
| | b9cecc2635 | |
| | 898285c9bf | |
| | a62de9ecfd | |
| | 4042d192f5 | |
| | 1117aa1411 | |
| | 080438477f | |
| | 4b5bcf8906 | |
.github/workflows/publish.yml (vendored, 11 changes)

```diff
@@ -43,13 +43,14 @@ jobs:
     name: Build Wheel
     runs-on: ${{ matrix.os }}
     needs: release

     strategy:
       fail-fast: false
       matrix:
         os: ['ubuntu-20.04']
         python-version: ['3.8', '3.9', '3.10', '3.11']
-        cuda-version: ['11.8'] # Github runner can't build anything older than 11.8
+        pytorch-version: ['2.1.0']
+        cuda-version: ['11.8', '12.1']

     steps:
       - name: Checkout
@@ -69,9 +70,9 @@ jobs:
         run: |
           bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}

-      - name: Install PyTorch-cu${{ matrix.cuda-version }}
+      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
         run: |
-          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}

       - name: Build wheel
         shell: bash
@@ -81,7 +82,7 @@ jobs:
           asset_name=${wheel_name//"linux"/"manylinux1"}
           echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
           echo "asset_name=${asset_name}" >> $GITHUB_ENV

       - name: Upload Release Asset
         uses: actions/upload-release-asset@v1
         env:
```
```diff
@@ -1,4 +1,4 @@
-name: pylint
+name: ruff

 on:
   # Trigger the workflow on push or pull request,
@@ -11,7 +11,7 @@ on:
       - main

 jobs:
-  pylint:
+  ruff:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -25,7 +25,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install pylint==2.8.2
+          pip install ruff==0.1.5
-      - name: Analysing the code with pylint
+      - name: Analysing the code with ruff
         run: |
-          pylint vllm
+          ruff vllm tests
```
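The same check can be reproduced locally before pushing. A minimal sketch, assuming `ruff==0.1.5` is installed in the current environment (at that version the bare `ruff <paths>` invocation runs the linter, matching the workflow step above):

```python
# Sketch: run the same ruff check locally that the CI workflow runs.
# Assumes `pip install ruff==0.1.5`; at that version `ruff <paths>` lints directly.
import subprocess

result = subprocess.run(["ruff", "vllm", "tests"], capture_output=True, text=True)
print(result.stdout)                 # lint findings, if any
raise SystemExit(result.returncode)  # non-zero exit is what fails CI
```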
.github/workflows/scripts/build.sh (vendored, 3 changes)

```diff
@@ -11,5 +11,8 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 $python_executable -m pip install wheel packaging
 $python_executable -m pip install -r requirements.txt

+# Limit the number of parallel jobs to avoid OOM
+export MAX_JOBS=1
+
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
```
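For context, `MAX_JOBS` works here because PyTorch's C++ extension builder, which the build goes through, reads that variable to cap Ninja parallelism. A sketch of that lookup, assuming the same convention as `torch.utils.cpp_extension`:

```python
# Sketch of how a build typically honors MAX_JOBS (an assumption based on the
# convention used by torch.utils.cpp_extension); not vLLM's actual setup.py.
import os
from typing import Optional

def max_build_jobs(default: Optional[int] = None) -> int:
    env = os.environ.get("MAX_JOBS")
    if env is not None:
        return int(env)  # e.g. MAX_JOBS=1 in CI to avoid OOM
    return default or os.cpu_count() or 1

print(max_build_jobs())
```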
.github/workflows/scripts/cuda-install.sh (vendored, 5 changes)

```diff
@@ -16,3 +16,8 @@ sudo apt clean
 # Test nvcc
 PATH=/usr/local/cuda-$1/bin:${PATH}
 nvcc --version
+
+# Log gcc, g++, c++ versions
+gcc --version
+g++ --version
+c++ --version
```
.github/workflows/scripts/pytorch-install.sh (vendored, 5 changes)

```diff
@@ -1,11 +1,12 @@
 #!/bin/bash

 python_executable=python$1
-cuda_version=$2
+pytorch_version=$2
+cuda_version=$3

 # Install torch
 $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
-$python_executable -m pip install torch -f https://download.pytorch.org/whl/cu${cuda_version//./}/torch_stable.html
+$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}

 # Print version information
 $python_executable --version
```
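A quick sketch of what the new pin expands to, translating the bash substitution `${cuda_version//./}` (which strips the dots) into Python, with the matrix defaults from publish.yml as sample values:

```python
# Sketch of the wheel pin the script now builds from its positional arguments.
pytorch_version = "2.1.0"  # $2
cuda_version = "11.8"      # $3
cu_tag = "cu" + cuda_version.replace(".", "")
requirement = f"torch=={pytorch_version}+{cu_tag}"
index_url = f"https://download.pytorch.org/whl/{cu_tag}"
print(requirement)  # torch==2.1.0+cu118
print(index_url)    # https://download.pytorch.org/whl/cu118
```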
.github/workflows/yapf.yml (vendored, 2 changes)

```diff
@@ -28,4 +28,4 @@ jobs:
         pip install toml==0.10.2
     - name: Running yapf
       run: |
-        yapf --diff --recursive vllm --exclude 'vllm/model_executor/parallel_utils/**'
+        yapf --diff --recursive vllm tests
```
.gitignore (vendored, 4 changes)

```diff
@@ -173,3 +173,7 @@ cython_debug/

 # Sphinx documentation
 _build/
+
+# vim swap files
+*.swo
+*.swp
```
.pylintrc (434 deletions)

```diff
@@ -1,434 +0,0 @@
-# This Pylint rcfile contains a best-effort configuration to uphold the
-# best-practices and style described in the Google Python style guide:
-# https://google.github.io/styleguide/pyguide.html
-#
-# Its canonical open-source location is:
-# https://google.github.io/styleguide/pylintrc
-
-[MASTER]
-
-# Files or directories to be skipped. They should be base names, not paths.
-ignore=docs,parallel_utils
-
-# Files or directories matching the regex patterns are skipped. The regex
-# matches against base names, not paths.
-ignore-patterns=
-
-# Pickle collected data for later comparisons.
-persistent=no
-
-# List of plugins (as comma separated values of python modules names) to load,
-# usually to register additional checkers.
-load-plugins=
-
-# Use multiple processes to speed up Pylint.
-jobs=4
-
-# Allow loading of arbitrary C extensions. Extensions are imported into the
-# active Python interpreter and may run arbitrary code.
-unsafe-load-any-extension=no
-
-
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
-confidence=
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-#enable=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once).You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use"--disable=all --enable=classes
-# --disable=W"
-disable=abstract-method,
-        apply-builtin,
-        arguments-differ,
-        attribute-defined-outside-init,
-        backtick,
-        bad-option-value,
-        basestring-builtin,
-        buffer-builtin,
-        c-extension-no-member,
-        consider-using-enumerate,
-        cmp-builtin,
-        cmp-method,
-        coerce-builtin,
-        coerce-method,
-        delslice-method,
-        div-method,
-        duplicate-code,
-        eq-without-hash,
-        execfile-builtin,
-        file-builtin,
-        filter-builtin-not-iterating,
-        fixme,
-        getslice-method,
-        global-statement,
-        hex-method,
-        idiv-method,
-        implicit-str-concat-in-sequence,
-        import-error,
-        import-self,
-        import-star-module-level,
-        inconsistent-return-statements,
-        input-builtin,
-        intern-builtin,
-        invalid-str-codec,
-        locally-disabled,
-        logging-fstring-interpolation, # added by vLLM
-        logging-not-lazy, # added by vLLM
-        long-builtin,
-        long-suffix,
-        map-builtin-not-iterating,
-        misplaced-comparison-constant,
-        missing-class-docstring, # TODO (vLLM): enable
-        missing-function-docstring,
-        missing-module-docstring, # TODO (vLLM): enable
-        metaclass-assignment,
-        next-method-called,
-        next-method-defined,
-        no-absolute-import,
-        no-else-break,
-        no-else-continue,
-        no-else-raise,
-        no-else-return,
-        no-init, # added
-        no-member,
-        no-name-in-module,
-        no-self-use,
-        nonzero-method,
-        oct-method,
-        old-division,
-        old-ne-operator,
-        old-octal-literal,
-        old-raise-syntax,
-        parameter-unpacking,
-        print-statement,
-        raising-string,
-        range-builtin-not-iterating,
-        raw_input-builtin,
-        rdiv-method,
-        reduce-builtin,
-        relative-import,
-        reload-builtin,
-        round-builtin,
-        setslice-method,
-        signature-differs,
-        standarderror-builtin,
-        suppressed-message,
-        sys-max-int,
-        too-few-public-methods,
-        too-many-ancestors,
-        too-many-arguments,
-        too-many-boolean-expressions,
-        too-many-branches,
-        too-many-instance-attributes,
-        too-many-locals,
-        too-many-nested-blocks,
-        too-many-public-methods,
-        too-many-return-statements,
-        too-many-statements,
-        trailing-newlines,
-        unichr-builtin,
-        unicode-builtin,
-        unnecessary-pass,
-        unpacking-in-except,
-        unspecified-encoding,
-        useless-else-on-loop,
-        useless-object-inheritance,
-        useless-suppression,
-        using-cmp-argument,
-        wrong-import-order,
-        xrange-builtin,
-        zip-builtin-not-iterating,
-
-
-[REPORTS]
-
-# Set the output format. Available formats are text, parseable, colorized, msvs
-# (visual studio) and html. You can also give a reporter class, eg
-# mypackage.mymodule.MyReporterClass.
-output-format=text
-
-# Tells whether to display a full report or only the messages
-reports=no
-
-# Python expression which should return a note less than 10 (10 is the highest
-# note). You have access to the variables errors warning, statement which
-# respectively contain the number of errors / warnings messages and the total
-# number of statements analyzed. This is used by the global evaluation report
-# (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details
-#msg-template=
-
-
-[BASIC]
-
-# Good variable names which should always be accepted, separated by a comma
-good-names=main,_
-
-# Bad variable names which should always be refused, separated by a comma
-bad-names=
-
-# Colon-delimited sets of names that determine each other's naming style when
-# the name regexes allow several styles.
-name-group=
-
-# Include a hint for the correct naming format with invalid-name
-include-naming-hint=no
-
-# List of decorators that produce properties, such as abc.abstractproperty. Add
-# to this list to register other decorators that produce valid properties.
-property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
-
-# Regular expression matching correct function names
-function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
-
-# Regular expression matching correct variable names
-variable-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct constant names
-const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
-
-# Regular expression matching correct attribute names
-attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
-
-# Regular expression matching correct argument names
-argument-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct class attribute names
-class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
-
-# Regular expression matching correct inline iteration names
-inlinevar-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct class names
-class-rgx=^_?[A-Z][a-zA-Z0-9]*$
-
-# Regular expression matching correct module names
-module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
-
-# Regular expression matching correct method names
-method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
-
-# Regular expression which should only match function or class names that do
-# not require a docstring.
-no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
-
-# Minimum line length for functions/classes that require docstrings, shorter
-# ones are exempt.
-docstring-min-length=10
-
-
-[TYPECHECK]
-
-# List of decorators that produce context managers, such as
-# contextlib.contextmanager. Add to this list to register other decorators that
-# produce valid context managers.
-contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis. It
-# supports qualified module names, as well as Unix pattern matching.
-ignored-modules=
-
-# List of class names for which member attributes should not be checked (useful
-# for classes with dynamically set attributes). This supports the use of
-# qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E1101 when accessed. Python regular
-# expressions are accepted.
-generated-members=
-
-
-[FORMAT]
-
-# Maximum number of characters on a single line.
-max-line-length=80
-
-# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
-# lines made too long by directives to pytype.
-
-# Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=(?x)(
-  ^\s*(\#\ )?<?https?://\S+>?$|
-  ^\s*(from\s+\S+\s+)?import\s+.+$)
-
-# Allow the body of an if to be on the same line as the test if there is no
-# else.
-single-line-if-stmt=yes
-
-# Maximum number of lines in a module
-max-module-lines=99999
-
-# String used as indentation unit. The internal Google style guide mandates 2
-# spaces. Google's externaly-published style guide says 4, consistent with
-# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
-# projects (like TensorFlow).
-indent-string=' '
-
-# Number of spaces of indent required inside a hanging or continued line.
-indent-after-paren=4
-
-# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
-expected-line-ending-format=
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=TODO
-
-
-[STRING]
-
-# This flag controls whether inconsistent-quotes generates a warning when the
-# character used as a quote delimiter is used inconsistently within a module.
-check-quote-consistency=yes
-
-
-[VARIABLES]
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# A regular expression matching the name of dummy variables (i.e. expectedly
-# not used).
-dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid to define new builtins when possible.
-additional-builtins=
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,_cb
-
-# List of qualified module names which can have objects that can redefine
-# builtins.
-redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
-
-
-[LOGGING]
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format
-logging-modules=logging,absl.logging,tensorflow.io.logging
-
-
-[SIMILARITIES]
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
-
-
-[SPELLING]
-
-# Spelling dictionary name. Available dictionaries: none. To make it working
-# install python-enchant package.
-spelling-dict=
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to indicated private dictionary in
-# --spelling-private-dict-file option instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[IMPORTS]
-
-# Deprecated modules which should not be used, separated by a comma
-deprecated-modules=regsub,
-                   TERMIOS,
-                   Bastion,
-                   rexec,
-                   sets
-
-# Create a graph of every (i.e. internal and external) dependencies in the
-# given file (report RP0402 must not be disabled)
-import-graph=
-
-# Create a graph of external dependencies in the given file (report RP0402 must
-# not be disabled)
-ext-import-graph=
-
-# Create a graph of internal dependencies in the given file (report RP0402 must
-# not be disabled)
-int-import-graph=
-
-# Force import order to recognize a module as part of the standard
-# compatibility libraries.
-known-standard-library=
-
-# Force import order to recognize a module as part of a third party library.
-known-third-party=enchant, absl
-
-# Analyse import fallback blocks. This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
-
-[CLASSES]
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,
-                      __new__,
-                      setUp
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,
-                  _fields,
-                  _replace,
-                  _source,
-                  _make
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls,
-                            class_
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=mcs
-
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=StandardError,
-                       Exception,
-                       BaseException
```
Dockerfile (new file, 77 additions)

```diff
@@ -0,0 +1,77 @@
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+
+RUN apt-get update -y \
+    && apt-get install -y python3-pip
+
+WORKDIR /workspace
+
+# install build and runtime dependencies
+COPY requirements.txt requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements.txt
+
+# install development dependencies
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-dev.txt
+
+# image to build pytorch extensions
+FROM dev AS build
+
+# install build dependencies
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build.txt
+
+# copy input files
+COPY csrc csrc
+COPY setup.py setup.py
+COPY requirements.txt requirements.txt
+COPY pyproject.toml pyproject.toml
+COPY vllm/__init__.py vllm/__init__.py
+
+# max jobs used by Ninja to build extensions
+ENV MAX_JOBS=$max_jobs
+RUN python3 setup.py build_ext --inplace
+
+# image to run unit testing suite
+FROM dev AS test
+
+# copy pytorch extensions separately to avoid having to rebuild
+# when python code changes
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY tests tests
+COPY vllm vllm
+
+ENTRYPOINT ["python3", "-m", "pytest", "tests"]
+
+# use CUDA base as CUDA runtime dependencies are already installed via pip
+FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
+
+# libnccl required for ray
+RUN apt-get update -y \
+    && apt-get install -y python3-pip
+
+WORKDIR /workspace
+COPY requirements.txt requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements.txt
+
+FROM vllm-base AS vllm
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY vllm vllm
+
+EXPOSE 8000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
+
+# openai api server alternative
+FROM vllm-base AS vllm-openai
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate fschat
+
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY vllm vllm
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
```
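A minimal sketch of driving this multi-stage build; the stage names (`test`, `vllm`, `vllm-openai`) come from the `FROM ... AS` lines above, while the image tags are illustrative:

```python
# Sketch: build and run the stages defined in the Dockerfile above via the
# Docker CLI. Stage names come from its `FROM ... AS` lines; tags are hypothetical.
import subprocess

for stage in ("test", "vllm", "vllm-openai"):
    subprocess.run(
        ["docker", "build", "--target", stage, "-t", f"vllm:{stage}", "."],
        check=True,
    )

# Both server entrypoints listen on port 8000 by default (the vllm stage
# declares EXPOSE 8000), so publish that port when running.
subprocess.run(
    ["docker", "run", "--gpus", "all", "-p", "8000:8000", "vllm:vllm-openai"],
    check=True,
)
```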
README.md (59 changes)

````diff
@@ -10,13 +10,16 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://vllm.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://github.com/vllm-project/vllm/discussions"><b>Discussions</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
 </p>

 ---

 *Latest News* 🔥
+- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
+- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
+- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
 - [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
 - [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
@@ -35,17 +38,18 @@ vLLM is fast with:

 vLLM is flexible and easy to use with:

-- Seamless integration with popular HuggingFace models
+- Seamless integration with popular Hugging Face models
 - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server

-vLLM seamlessly supports many Huggingface models, including the following architectures:
+vLLM seamlessly supports many Hugging Face models, including the following architectures:

-- Aquila (`BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
+- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
-- Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
+- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
 - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
 - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
 - GPT-2 (`gpt2`, `gpt2-xl`, etc.)
 - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
@@ -53,9 +57,12 @@ vLLM seamlessly supports many Huggingface models, including the following archit
 - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
 - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
 - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
 - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
+- Phi-1.5 (`microsoft/phi-1_5`, etc.)
 - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
+- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

 Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

@@ -70,37 +77,19 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
 - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
 - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)

-## Performance
-
-vLLM outperforms HuggingFace Transformers (HF) by up to 24x and Text Generation Inference (TGI) by up to 3.5x, in terms of throughput.
-For details, check out our [blog post](https://vllm.ai).
-
-<p align="center">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a10g_n1_dark.png">
-    <img src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a10g_n1_light.png" width="45%">
-  </picture>
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a100_n1_dark.png">
-    <img src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a100_n1_light.png" width="45%">
-  </picture>
-  <br>
-  <em> Serving throughput when each request asks for 1 output completion. </em>
-</p>
-
-<p align="center">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a10g_n3_dark.png">
-    <img src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a10g_n3_light.png" width="45%">
-  </picture>
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a100_n3_dark.png">
-    <img src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/figures/perf_a100_n3_light.png" width="45%">
-  </picture> <br>
-  <em> Serving throughput when each request asks for 3 output completions. </em>
-</p>
-
 ## Contributing

 We welcome and value any contributions and collaborations.
 Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

+## Citation
+
+If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
+```bibtex
+@inproceedings{kwon2023efficient,
+  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
+  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
+  booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
+  year={2023}
+}
+```
````
```diff
@@ -12,16 +12,15 @@ from vllm import LLM, SamplingParams
 def main(args: argparse.Namespace):
     print(args)

-    # Process all the requests in a single batch if possible.
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
     llm = LLM(
         model=args.model,
         tokenizer=args.tokenizer,
+        quantization=args.quantization,
         tensor_parallel_size=args.tensor_parallel_size,
-        max_num_seqs=args.batch_size,
-        max_num_batched_tokens=args.batch_size * args.input_len,
         trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
     )

     sampling_params = SamplingParams(
@@ -37,22 +36,31 @@ def main(args: argparse.Namespace):

     def run_to_completion(profile: bool = False):
         if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
-
-        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
-                     sampling_params=sampling_params,
-                     use_tqdm=False)
-
-        end_time = time.time()
-        latency = end_time - start_time
-        if profile:
-            torch.cuda.cudart().cudaProfilerStop()
-        return latency
+            with torch.profiler.profile(activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+            ]) as p:
+                llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                             sampling_params=sampling_params,
+                             use_tqdm=False)
+            print(p.key_averages())
+        else:
+            start_time = time.perf_counter()
+            llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+            end_time = time.perf_counter()
+            latency = end_time - start_time
+            return latency

     print("Warming up...")
     run_to_completion(profile=False)

+    if args.profile:
+        print("Profiling...")
+        run_to_completion(profile=True)
+        return
+
     # Benchmark.
     latencies = []
     for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
@@ -63,19 +71,41 @@ def main(args: argparse.Namespace):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description='Benchmark the latency of processing a single batch of '
         'requests till completion.')
     parser.add_argument('--model', type=str, default='facebook/opt-125m')
     parser.add_argument('--tokenizer', type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=['awq', 'squeezellm', None],
+                        default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
     parser.add_argument('--output-len', type=int, default=128)
     parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n', type=int, default=1,
+    parser.add_argument('--n',
+                        type=int,
+                        default=1,
                         help='Number of generated sequences per prompt.')
     parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters', type=int, default=3,
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=3,
                         help='Number of iterations to run.')
-    parser.add_argument('--trust-remote-code', action='store_true',
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
     args = parser.parse_args()
     main(args)
```
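The profiling pattern the new branch uses can be exercised standalone. A minimal sketch with a dummy matmul workload standing in for `llm.generate` (the workload is hypothetical, and listing CUDA activity on a CPU-only machine is an untested assumption here):

```python
# Standalone sketch of the torch.profiler usage introduced in the diff above,
# with a dummy workload in place of llm.generate (that part is hypothetical).
import torch

def dummy_workload() -> torch.Tensor:
    x = torch.randn(512, 512)
    for _ in range(8):
        x = x @ x.T / 512  # keep values bounded while doing real work
    return x

with torch.profiler.profile(activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
]) as p:
    dummy_workload()

# Same kind of summary the benchmark prints: per-op aggregate timings.
print(p.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```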
```diff
@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()

     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":
@@ -148,7 +148,7 @@ async def send_request(
         if "error" not in output:
             break

-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))

@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
```
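The switch from `time.time()` to `time.perf_counter()` matters because `perf_counter` is a monotonic, high-resolution clock, so measured intervals are immune to NTP adjustments and clock steps that can skew `time.time()` mid-benchmark. A tiny illustration:

```python
# Why perf_counter for intervals: it is monotonic and high-resolution, while
# time.time() can jump if the system clock is adjusted during the measurement.
import time

start = time.perf_counter()
time.sleep(0.25)
elapsed = time.perf_counter() - start
print(f"elapsed: {elapsed:.4f} s")  # ~0.25 regardless of wall-clock changes
```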
@@ -3,34 +3,31 @@ import argparse
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from typing import List, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
|
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||||
|
PreTrainedTokenizerBase)
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
def sample_requests(
|
def sample_requests(
|
||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
|
fixed_output_len: Optional[int],
|
||||||
) -> List[Tuple[str, int, int]]:
|
) -> List[Tuple[str, int, int]]:
|
||||||
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
|
raise ValueError("output_len too small")
|
||||||
|
|
||||||
# Load the dataset.
|
# Load the dataset.
|
||||||
with open(dataset_path) as f:
|
with open(dataset_path) as f:
|
||||||
dataset = json.load(f)
|
dataset = json.load(f)
|
||||||
# Filter out the conversations with less than 2 turns.
|
# Filter out the conversations with less than 2 turns.
|
||||||
dataset = [
|
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||||
data for data in dataset
|
|
||||||
if len(data["conversations"]) >= 2
|
|
||||||
]
|
|
||||||
# Only keep the first two turns of each conversation.
|
# Only keep the first two turns of each conversation.
|
||||||
dataset = [
|
dataset = [(data["conversations"][0]["value"],
|
||||||
(data["conversations"][0]["value"], data["conversations"][1]["value"])
|
data["conversations"][1]["value"]) for data in dataset]
|
||||||
for data in dataset
|
|
||||||
]
|
|
||||||
|
|
||||||
# Tokenize the prompts and completions.
|
# Tokenize the prompts and completions.
|
||||||
prompts = [prompt for prompt, _ in dataset]
|
prompts = [prompt for prompt, _ in dataset]
|
||||||
@@ -40,6 +37,8 @@ def sample_requests(
|
|||||||
tokenized_dataset = []
|
tokenized_dataset = []
|
||||||
for i in range(len(dataset)):
|
for i in range(len(dataset)):
|
||||||
output_len = len(completion_token_ids[i])
|
output_len = len(completion_token_ids[i])
|
||||||
|
if fixed_output_len is not None:
|
||||||
|
output_len = fixed_output_len
|
||||||
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
|
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
|
||||||
|
|
||||||
# Filter out too long sequences.
|
# Filter out too long sequences.
|
||||||
@@ -63,18 +62,25 @@ def run_vllm(
|
|||||||
requests: List[Tuple[str, int, int]],
|
requests: List[Tuple[str, int, int]],
|
||||||
model: str,
|
model: str,
|
||||||
tokenizer: str,
|
tokenizer: str,
|
||||||
|
quantization: Optional[str],
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
seed: int,
|
seed: int,
|
||||||
n: int,
|
n: int,
|
||||||
use_beam_search: bool,
|
use_beam_search: bool,
|
||||||
trust_remote_code: bool,
|
trust_remote_code: bool,
|
||||||
|
dtype: str,
|
||||||
|
max_model_len: Optional[int] = None,
|
||||||
) -> float:
|
) -> float:
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model=model,
|
model=model,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
|
quantization=quantization,
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
seed=seed,
|
seed=seed,
|
||||||
trust_remote_code=trust_remote_code,
|
trust_remote_code=trust_remote_code,
|
||||||
|
dtype=dtype,
|
||||||
|
max_model_len=max_model_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
@@ -94,10 +100,10 @@ def run_vllm(
|
|||||||
sampling_params=sampling_params,
|
sampling_params=sampling_params,
|
||||||
)
|
)
|
||||||
|
|
||||||
start = time.time()
|
start = time.perf_counter()
|
||||||
# FIXME(woosuk): Do use internal method.
|
# FIXME(woosuk): Do not use internal method.
|
||||||
llm._run_engine(use_tqdm=True)
|
llm._run_engine(use_tqdm=True)
|
||||||
end = time.time()
|
end = time.perf_counter()
|
||||||
return end - start
|
return end - start
|
||||||
|
|
||||||
|
|
||||||
@@ -111,15 +117,15 @@ def run_hf(
|
|||||||
trust_remote_code: bool,
|
trust_remote_code: bool,
|
||||||
) -> float:
|
) -> float:
|
||||||
assert not use_beam_search
|
assert not use_beam_search
|
||||||
llm = AutoModelForCausalLM.from_pretrained(model,
|
llm = AutoModelForCausalLM.from_pretrained(
|
||||||
torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
|
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
|
||||||
if llm.config.model_type == "llama":
|
if llm.config.model_type == "llama":
|
||||||
# To enable padding in the HF backend.
|
# To enable padding in the HF backend.
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
llm = llm.cuda()
|
llm = llm.cuda()
|
||||||
|
|
||||||
pbar = tqdm(total=len(requests))
|
pbar = tqdm(total=len(requests))
|
||||||
start = time.time()
|
start = time.perf_counter()
|
||||||
batch: List[str] = []
|
batch: List[str] = []
|
||||||
max_prompt_len = 0
|
max_prompt_len = 0
|
||||||
max_output_len = 0
|
max_output_len = 0
|
||||||
@@ -132,13 +138,14 @@ def run_hf(
|
|||||||
if len(batch) < max_batch_size and i != len(requests) - 1:
|
if len(batch) < max_batch_size and i != len(requests) - 1:
|
||||||
# Check if we can add more requests to the batch.
|
# Check if we can add more requests to the batch.
|
||||||
_, next_prompt_len, next_output_len = requests[i + 1]
|
_, next_prompt_len, next_output_len = requests[i + 1]
|
||||||
if (max(max_prompt_len, next_prompt_len) + max(
|
if (max(max_prompt_len, next_prompt_len) +
|
||||||
max_output_len, next_output_len)) <= 2048:
|
max(max_output_len, next_output_len)) <= 2048:
|
||||||
# We can add more requests to the batch.
|
# We can add more requests to the batch.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Generate the sequences.
|
# Generate the sequences.
|
||||||
input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
|
input_ids = tokenizer(batch, return_tensors="pt",
|
||||||
|
padding=True).input_ids
|
||||||
llm_outputs = llm.generate(
|
llm_outputs = llm.generate(
|
||||||
input_ids=input_ids.cuda(),
|
input_ids=input_ids.cuda(),
|
||||||
do_sample=not use_beam_search,
|
do_sample=not use_beam_search,
|
||||||
@@ -156,7 +163,23 @@ def run_hf(
|
|||||||
batch = []
|
batch = []
|
||||||
max_prompt_len = 0
|
max_prompt_len = 0
|
||||||
max_output_len = 0
|
max_output_len = 0
|
||||||
end = time.time()
|
end = time.perf_counter()
|
||||||
|
return end - start
|
||||||
|
|
||||||
|
|
||||||
|
def run_mii(
|
||||||
|
requests: List[Tuple[str, int, int]],
|
||||||
|
model: str,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
output_len: int,
|
||||||
|
) -> float:
|
||||||
|
from mii import pipeline
|
||||||
|
llm = pipeline(model, tensor_parallel=tensor_parallel_size)
|
||||||
|
prompts = [prompt for prompt, _, _ in requests]
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
llm(prompts, max_new_tokens=output_len)
|
||||||
|
end = time.perf_counter()
|
||||||
return end - start
|
return end - start
|
||||||
|
|
||||||
|
|
@@ -165,49 +188,105 @@ def main(args: argparse.Namespace):
     random.seed(args.seed)

     # Sample the requests.
-    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
-    requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    if args.dataset is None:
+        # Synthesize a prompt with the given input length.
+        prompt = "hi" * (args.input_len - 1)
+        requests = [(prompt, args.input_len, args.output_len)
+                    for _ in range(args.num_prompts)]
+    else:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)

     if args.backend == "vllm":
-        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.tensor_parallel_size,
-            args.seed, args.n, args.use_beam_search, args.trust_remote_code)
+        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+                                args.quantization, args.tensor_parallel_size,
+                                args.seed, args.n, args.use_beam_search,
+                                args.trust_remote_code, args.dtype,
+                                args.max_model_len)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
-        elapsed_time = run_hf(
-            requests, args.model, tokenizer, args.n, args.use_beam_search,
-            args.hf_max_batch_size, args.trust_remote_code)
+        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
+                              args.use_beam_search, args.hf_max_batch_size,
+                              args.trust_remote_code)
+    elif args.backend == "mii":
+        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
+                               args.output_len)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(
-        prompt_len + output_len
-        for _, prompt_len, output_len in requests
-    )
+    total_num_tokens = sum(prompt_len + output_len
+                           for _, prompt_len, output_len in requests)
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} tokens/s")


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Benchmark the throughput.")
-    parser.add_argument("--backend", type=str, choices=["vllm", "hf"],
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
                         default="vllm")
-    parser.add_argument("--dataset", type=str, required=True,
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
                         help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
     parser.add_argument("--model", type=str, default="facebook/opt-125m")
     parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=['awq', 'squeezellm', None],
+                        default=None)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
-    parser.add_argument("--n", type=int, default=1,
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
                         help="Number of generated sequences per prompt.")
     parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--num-prompts", type=int, default=1000,
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=1000,
                         help="Number of prompts to process.")
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--hf-max-batch-size", type=int, default=None,
+    parser.add_argument("--hf-max-batch-size",
+                        type=int,
+                        default=None,
                         help="Maximum batch size for HF backend.")
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
     args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    if args.dataset is None:
+        assert args.input_len is not None
+        assert args.output_len is not None
+    else:
+        assert args.input_len is None

     if args.backend == "vllm":
         if args.hf_max_batch_size is not None:
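
To make the throughput report above concrete, here is the same arithmetic with illustrative numbers (1000 requests, 550,000 total prompt-plus-output tokens, 500 s elapsed):

    num_requests, total_num_tokens, elapsed_time = 1000, 550_000, 500.0
    print(f"Throughput: {num_requests / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
    # Throughput: 2.00 requests/s, 1100.00 tokens/s
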
@@ -215,7 +294,20 @@ if __name__ == "__main__":
     elif args.backend == "hf":
         if args.hf_max_batch_size is None:
             raise ValueError("HF max batch size is required for HF backend.")
-    if args.tokenizer is None:
-        args.tokenizer = args.model
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+    elif args.backend == "mii":
+        if args.dtype != "auto":
+            raise ValueError("dtype must be auto for MII backend.")
+        if args.n != 1:
+            raise ValueError("n must be 1 for MII backend.")
+        if args.use_beam_search:
+            raise ValueError("Beam search is not supported for MII backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+        if args.tokenizer != args.model:
+            raise ValueError("Tokenizer must be the same as the model for MII "
+                             "backend.")
     main(args)
benchmarks/kernels/benchmark_paged_attention.py (new file, 197 lines)
@@ -0,0 +1,197 @@
+import argparse
+import random
+import time
+
+import torch
+
+from vllm._C import ops
+
+NUM_BLOCKS = 1024
+PARTITION_SIZE = 512
+
+
+@torch.inference_mode()
+def main(
+    version: str,
+    num_seqs: int,
+    context_len: int,
+    num_query_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    do_profile: bool,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+
+    scale = float(1.0 / (head_size**0.5))
+    query = torch.empty(num_seqs,
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype,
+                        device="cuda")
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    num_queries_per_kv = num_query_heads // num_kv_heads
+    head_mapping = torch.repeat_interleave(
+        torch.arange(num_kv_heads, dtype=torch.int32, device="cuda"),
+        num_queries_per_kv)
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads,
+                                   dtype=torch.float,
+                                   device="cuda")
+
+    context_lens = [context_len for _ in range(num_seqs)]
+    max_context_len = max(context_lens)
+    context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
+    block_tables = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1)
+            for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables.append(block_table)
+    block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")
+
+    # Create the KV cache.
+    x = 16 // torch.tensor([], dtype=dtype).element_size()
+    key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x)
+    key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device="cuda")
+    key_cache.uniform_(-scale, scale)
+    value_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size, block_size)
+    value_cache = torch.empty(size=value_cache_shape,
+                              dtype=dtype,
+                              device="cuda")
+    value_cache.uniform_(-scale, scale)
+
+    # Prepare for the paged attention kernel.
+    output = torch.empty_like(query)
+    if version == "v2":
+        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
+                          PARTITION_SIZE)
+        tmp_output = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+
+    def run_benchmark(num_iters: int, profile: bool = False) -> float:
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        for _ in range(num_iters):
+            if version == "v1":
+                ops.paged_attention_v1(
+                    output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    head_mapping,
+                    scale,
+                    block_tables,
+                    context_lens,
+                    block_size,
+                    max_context_len,
+                    alibi_slopes,
+                )
+            elif version == "v2":
+                ops.paged_attention_v2(
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    head_mapping,
+                    scale,
+                    block_tables,
+                    context_lens,
+                    block_size,
+                    max_context_len,
+                    alibi_slopes,
+                )
+            else:
+                raise ValueError(f"Invalid version: {version}")
+        torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+        if profile:
+            # Stop the profiler range opened above.
+            torch.cuda.cudart().cudaProfilerStop()
+        return (end_time - start_time) / num_iters
+
+    # Warmup.
+    print("Warming up...")
+    run_benchmark(num_iters=3, profile=False)
+
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=100, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Benchmark the paged attention kernel.")
+    parser.add_argument("--version",
+                        type=str,
+                        choices=["v1", "v2"],
+                        default="v2")
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--context-len", type=int, default=4096)
+    parser.add_argument("--num-query-heads", type=int, default=64)
+    parser.add_argument("--num-kv-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 256],
+                        default=128)
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--use-alibi", action="store_true")
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["half", "bfloat16", "float"],
+                        default="half")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--profile", action="store_true")
+    args = parser.parse_args()
+    print(args)
+
+    if args.num_query_heads % args.num_kv_heads != 0:
+        raise ValueError("num_query_heads must be divisible by num_kv_heads")
+    dtype_to_torch_dtype = {
+        "half": torch.half,
+        "bfloat16": torch.bfloat16,
+        "float": torch.float,
+    }
+    main(
+        version=args.version,
+        num_seqs=args.batch_size,
+        context_len=args.context_len,
+        num_query_heads=args.num_query_heads,
+        num_kv_heads=args.num_kv_heads,
+        head_size=args.head_size,
+        block_size=args.block_size,
+        use_alibi=args.use_alibi,
+        dtype=dtype_to_torch_dtype[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+    )
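
As a sanity check on the v2 workspace sizing above, the script's defaults (batch 8, 64 query heads, head size 128, context 4096, fp16) give 8 partitions and a 1 MiB tmp_output buffer; the snippet below is just that arithmetic, not part of the benchmark:

    num_seqs, num_query_heads, head_size = 8, 64, 128
    max_context_len, partition_size = 4096, 512
    num_partitions = (max_context_len + partition_size - 1) // partition_size
    tmp_output_elems = num_seqs * num_query_heads * num_partitions * head_size
    print(num_partitions)                # 8
    print(tmp_output_elems * 2 / 2**20)  # fp16 bytes -> 1.0 MiB
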
@@ -1,28 +0,0 @@
-#include <torch/extension.h>
-
-void silu_and_mul(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-void gelu_new(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-void gelu_fast(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "silu_and_mul",
-    &silu_and_mul,
-    "Activation function used in SwiGLU.");
-  m.def(
-    "gelu_new",
-    &gelu_new,
-    "GELU implementation used in GPT-2.");
-  m.def(
-    "gelu_fast",
-    &gelu_fast,
-    "Approximate GELU implementation.");
-}
@@ -13,11 +13,11 @@ __device__ __forceinline__ T silu(const T& x) {

 template<typename scalar_t>
 __global__ void silu_and_mul_kernel(
-  scalar_t* __restrict__ out,               // [num_tokens, d]
-  const scalar_t* __restrict__ input,       // [num_tokens, 2, d]
+  scalar_t* __restrict__ out,               // [..., d]
+  const scalar_t* __restrict__ input,       // [..., 2, d]
   const int d) {
-  const int token_idx = blockIdx.x;
-  for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]);
     const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]);
     out[token_idx * d + idx] = silu(x) * y;
@@ -27,11 +27,11 @@ __global__ void silu_and_mul_kernel(
 } // namespace vllm

 void silu_and_mul(
-  torch::Tensor& out,      // [num_tokens, d]
-  torch::Tensor& input)    // [num_tokens, 2 * d]
+  torch::Tensor& out,      // [..., d]
+  torch::Tensor& input)    // [..., 2 * d]
 {
-  int num_tokens = input.size(0);
-  int d = input.size(1) / 2;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  int d = input.size(-1) / 2;

   dim3 grid(num_tokens);
   dim3 block(std::min(d, 1024));
@@ -52,11 +52,11 @@ namespace vllm {
 // Element-wise activation kernel template.
 template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
 __global__ void activation_kernel(
-  scalar_t* __restrict__ out,               // [num_tokens, d]
-  const scalar_t* __restrict__ input,       // [num_tokens, d]
+  scalar_t* __restrict__ out,               // [..., d]
+  const scalar_t* __restrict__ input,       // [..., d]
   const int d) {
-  const int token_idx = blockIdx.x;
-  for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     const scalar_t x = __ldg(&input[token_idx * d + idx]);
     out[token_idx * d + idx] = ACT_FN(x);
   }
@@ -66,8 +66,8 @@ __global__ void activation_kernel(

 // Launch element-wise activation kernel.
 #define LAUNCH_ACTIVATION_KERNEL(KERNEL)                               \
-  int num_tokens = input.size(0);                                     \
-  int d = input.size(1);                                              \
+  int d = input.size(-1);                                             \
+  int64_t num_tokens = input.numel() / d;                             \
   dim3 grid(num_tokens);                                              \
   dim3 block(std::min(d, 1024));                                      \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();       \
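
The rewritten macro (like the kernels above it) generalizes the launchers from [num_tokens, d] inputs to arbitrary [..., d] shapes by flattening all leading dimensions into the grid size. A quick illustration of the same flattening in PyTorch:

    import torch

    x = torch.randn(2, 3, 5, 8)   # any [..., d] tensor, here d = 8
    d = x.size(-1)
    num_tokens = x.numel() // d   # 2 * 3 * 5 = 30 flattened rows
    assert x.reshape(num_tokens, d).shape == (30, 8)
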
@@ -100,15 +100,15 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
 } // namespace vllm

 void gelu_new(
-  torch::Tensor& out,      // [num_tokens, d]
-  torch::Tensor& input)    // [num_tokens, d]
+  torch::Tensor& out,      // [..., d]
+  torch::Tensor& input)    // [..., d]
 {
   LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
 }

 void gelu_fast(
-  torch::Tensor& out,      // [num_tokens, d]
-  torch::Tensor& input)    // [num_tokens, d]
+  torch::Tensor& out,      // [..., d]
+  torch::Tensor& input)    // [..., d]
 {
   LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
 }
@@ -1,22 +0,0 @@
-#include <torch/extension.h>
-#include <c10/util/Optional.h>
-
-void single_query_cached_kv_attention(
-  torch::Tensor& out,
-  torch::Tensor& query,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  torch::Tensor& head_mapping,
-  float scale,
-  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
-  int block_size,
-  int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "single_query_cached_kv_attention",
-    &single_query_cached_kv_attention,
-    "Compute the attention between an input query and the cached key/value tensors");
-}
@@ -26,6 +26,7 @@
 #define WARP_SIZE 32
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

 namespace vllm {

@@ -65,14 +66,18 @@ inline __device__ float block_sum(float* red_smem, float sum) {
   return __shfl_sync(uint32_t(-1), sum, 0);
 }

-// Grid: (num_heads, num_seqs).
+// TODO(woosuk): Merge the last two dimensions of the grid.
+// Grid: (num_heads, num_seqs, max_num_partitions).
 template<
   typename scalar_t,
   int HEAD_SIZE,
   int BLOCK_SIZE,
-  int NUM_THREADS>
-__global__ void single_query_cached_kv_attention_kernel(
-  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
+  int NUM_THREADS,
+  int PARTITION_SIZE = 0> // Zero means no partitioning.
+__device__ void paged_attention_kernel(
+  float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
+  float* __restrict__ max_logits,         // [num_seqs, num_heads, max_num_partitions]
+  scalar_t* __restrict__ out,             // [num_seqs, num_heads, max_num_partitions, head_size]
   const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
   const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
   const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
@@ -85,10 +90,33 @@ __global__ void single_query_cached_kv_attention_kernel(
   const int q_stride,
   const int kv_block_stride,
   const int kv_head_stride) {
+  const int seq_idx = blockIdx.y;
+  const int partition_idx = blockIdx.z;
+  const int max_num_partitions = gridDim.z;
+  constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
+  const int context_len = context_lens[seq_idx];
+  if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
+    // No work to do. Terminate the thread block.
+    return;
+  }
+
+  const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
+  const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
+
+  // [start_block_idx, end_block_idx) is the range of blocks to process.
+  const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
+  const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
+  const int num_blocks = end_block_idx - start_block_idx;
+
+  // [start_token_idx, end_token_idx) is the range of tokens to process.
+  const int start_token_idx = start_block_idx * BLOCK_SIZE;
+  const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
+  const int num_tokens = end_token_idx - start_token_idx;
+
   constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
   assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
-  constexpr int NUM_TOKENS_PER_THREAD_GROUP = (BLOCK_SIZE + WARP_SIZE - 1) / WARP_SIZE;
+  constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   const int thread_idx = threadIdx.x;
   const int warp_idx = thread_idx / WARP_SIZE;
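
The index arithmetic added above tiles each sequence's context into fixed-size partitions, with the last partition possibly ragged. A Python mirror of the same math (our helper, for illustration only) makes the tiling easy to verify:

    def partition_ranges(context_len, block_size, partition_size):
        # Mirrors the kernel: ranges are computed in whole KV-cache blocks.
        num_context_blocks = -(-context_len // block_size)    # ceil division
        blocks_per_part = partition_size // block_size
        num_parts = -(-context_len // partition_size)
        for partition_idx in range(num_parts):
            start_block = partition_idx * blocks_per_part
            end_block = min(start_block + blocks_per_part, num_context_blocks)
            start_token = start_block * block_size
            end_token = min(end_block * block_size, context_len)
            yield (start_token, end_token)

    print(list(partition_ranges(1100, 16, 512)))
    # [(0, 512), (512, 1024), (1024, 1100)]
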
@@ -97,7 +125,6 @@ __global__ void single_query_cached_kv_attention_kernel(
   const int head_idx = blockIdx.x;
   const int num_heads = gridDim.x;
   const int kv_head_idx = head_mapping[head_idx];
-  const int seq_idx = blockIdx.y;
   const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];

   // A vector type to store a part of a key or a query.
@@ -142,16 +169,16 @@ __global__ void single_query_cached_kv_attention_kernel(
   constexpr int x = 16 / sizeof(scalar_t);
   float qk_max = -FLT_MAX;

-  const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
-  const int context_len = context_lens[seq_idx];
-  const int num_blocks = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
-
   // Iterate over the key blocks.
   // Each warp fetches a block of keys for each iteration.
   // Each thread group in a warp fetches a key from the block, and computes
   // dot product with the query.
-  for (int block_idx = warp_idx; block_idx < num_blocks; block_idx += NUM_WARPS) {
-    const int physical_block_number = block_table[block_idx];
+  const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
+  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+    // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
+    // because int32 can lead to overflow when this variable is multiplied by large numbers
+    // (e.g., kv_block_stride).
+    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);

     // Load a key to registers.
     // Each thread in a thread group has a different part of the key.
@@ -184,7 +211,7 @@ __global__ void single_query_cached_kv_attention_kernel(
       // Store the partial reductions to shared memory.
       // NOTE(woosuk): It is required to zero out the masked logits.
       const bool mask = token_idx >= context_len;
-      logits[token_idx] = mask ? 0.f : qk;
+      logits[token_idx - start_token_idx] = mask ? 0.f : qk;
       // Update the max value.
       qk_max = mask ? qk_max : fmaxf(qk_max, qk);
     }
@@ -215,7 +242,7 @@ __global__ void single_query_cached_kv_attention_kernel(

   // Get the sum of the exp values.
   float exp_sum = 0.f;
-  for (int i = thread_idx; i < context_len; i += NUM_THREADS) {
+  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
     float val = __expf(logits[i] - qk_max);
     logits[i] = val;
     exp_sum += val;
@@ -224,11 +251,23 @@ __global__ void single_query_cached_kv_attention_kernel(

   // Compute softmax.
   const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
-  for (int i = thread_idx; i < context_len; i += NUM_THREADS) {
+  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
     logits[i] *= inv_sum;
   }
   __syncthreads();

+  // If partitioning is enabled, store the max logit and exp_sum.
+  if (USE_PARTITIONING && thread_idx == 0) {
+    float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+                                       + head_idx * max_num_partitions
+                                       + partition_idx;
+    *max_logits_ptr = qk_max;
+    float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+                                   + head_idx * max_num_partitions
+                                   + partition_idx;
+    *exp_sums_ptr = exp_sum;
+  }
+
   // Each thread will fetch 16 bytes from the value cache at a time.
   constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
   using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
@@ -237,7 +276,7 @@ __global__ void single_query_cached_kv_attention_kernel(

   constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
   constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
-  constexpr int NUM_ROWS_PER_THREAD = (HEAD_SIZE + NUM_ROWS_PER_ITER - 1) / NUM_ROWS_PER_ITER;
+  constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);

   // NOTE(woosuk): We use FP32 for the accumulator for better accuracy.
   float accs[NUM_ROWS_PER_THREAD];
@@ -248,12 +287,15 @@ __global__ void single_query_cached_kv_attention_kernel(

   scalar_t zero_value;
   zero(zero_value);
-  for (int block_idx = warp_idx; block_idx < num_blocks; block_idx += NUM_WARPS) {
-    const int physical_block_number = block_table[block_idx];
+  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+    // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
+    // because int32 can lead to overflow when this variable is multiplied by large numbers
+    // (e.g., kv_block_stride).
+    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
     const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
     const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
     L_vec logits_vec;
-    from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx));
+    from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx - start_token_idx));

     const scalar_t* v_ptr = v_cache + physical_block_number * kv_block_stride
                                     + kv_head_idx * kv_head_stride;
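
The int64 cast in this hunk is easy to motivate with plain numbers: the element offset block_number * kv_block_stride can exceed 2**31 - 1 even when both factors fit comfortably in int32 (values below are illustrative):

    block_number = 200_000
    kv_block_stride = 16_384      # e.g. num_kv_heads * head_size * block_size
    offset = block_number * kv_block_stride
    print(offset)                 # 3276800000, past the int32 limit
    print(offset - 2**32)         # -1018167296: what 32-bit math would yield
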
@@ -263,13 +305,13 @@ __global__ void single_query_cached_kv_attention_kernel(
       if (row_idx < HEAD_SIZE) {
         const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
         V_vec v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
-        if (block_idx == num_blocks - 1) {
+        if (block_idx == num_context_blocks - 1) {
           // NOTE(woosuk): When v_vec contains the tokens that are out of the context,
           // we should explicitly zero out the values since they may contain NaNs.
           // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
           scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
 #pragma unroll
-          for (int j = 0; j <= V_VEC_SIZE; j++) {
+          for (int j = 0; j < V_VEC_SIZE; j++) {
             v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
           }
         }
@@ -327,7 +369,9 @@ __global__ void single_query_cached_kv_attention_kernel(

   // Write the final output.
   if (warp_idx == 0) {
-    scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+    scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                            + head_idx * max_num_partitions * HEAD_SIZE
+                            + partition_idx * HEAD_SIZE;
 #pragma unroll
     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
       const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
@@ -338,10 +382,167 @@ __global__ void single_query_cached_kv_attention_kernel(
   }
 }

+// Grid: (num_heads, num_seqs, 1).
+template<
+  typename scalar_t,
+  int HEAD_SIZE,
+  int BLOCK_SIZE,
+  int NUM_THREADS>
+__global__ void paged_attention_v1_kernel(
+  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
+  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+  const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+  const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
+  const int* __restrict__ head_mapping,   // [num_heads]
+  const float scale,
+  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
+  const int* __restrict__ context_lens,   // [num_seqs]
+  const int max_num_blocks_per_seq,
+  const float* __restrict__ alibi_slopes, // [num_heads]
+  const int q_stride,
+  const int kv_block_stride,
+  const int kv_head_stride) {
+  paged_attention_kernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS>(
+    /* exp_sums */ nullptr, /* max_logits */ nullptr,
+    out, q, k_cache, v_cache, head_mapping, scale, block_tables, context_lens,
+    max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride);
+}
+
+// Grid: (num_heads, num_seqs, max_num_partitions).
+template<
+  typename scalar_t,
+  int HEAD_SIZE,
+  int BLOCK_SIZE,
+  int NUM_THREADS,
+  int PARTITION_SIZE>
+__global__ void paged_attention_v2_kernel(
+  float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
+  float* __restrict__ max_logits,         // [num_seqs, num_heads, max_num_partitions]
+  scalar_t* __restrict__ tmp_out,         // [num_seqs, num_heads, max_num_partitions, head_size]
+  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+  const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+  const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
+  const int* __restrict__ head_mapping,   // [num_heads]
+  const float scale,
+  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
+  const int* __restrict__ context_lens,   // [num_seqs]
+  const int max_num_blocks_per_seq,
+  const float* __restrict__ alibi_slopes, // [num_heads]
+  const int q_stride,
+  const int kv_block_stride,
+  const int kv_head_stride) {
+  paged_attention_kernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, PARTITION_SIZE>(
+    exp_sums, max_logits, tmp_out, q, k_cache, v_cache, head_mapping, scale,
+    block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
+    q_stride, kv_block_stride, kv_head_stride);
+}
+
+// Grid: (num_heads, num_seqs).
+template<
+  typename scalar_t,
+  int HEAD_SIZE,
+  int NUM_THREADS,
+  int PARTITION_SIZE>
+__global__ void paged_attention_v2_reduce_kernel(
+  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
+  const float* __restrict__ exp_sums,     // [num_seqs, num_heads, max_num_partitions]
+  const float* __restrict__ max_logits,   // [num_seqs, num_heads, max_num_partitions]
+  const scalar_t* __restrict__ tmp_out,   // [num_seqs, num_heads, max_num_partitions, head_size]
+  const int* __restrict__ context_lens,   // [num_seqs]
+  const int max_num_partitions) {
+  const int num_heads = gridDim.x;
+  const int head_idx = blockIdx.x;
+  const int seq_idx = blockIdx.y;
+  const int context_len = context_lens[seq_idx];
+  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
+  if (num_partitions == 1) {
+    // No need to reduce. Only copy tmp_out to out.
+    scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+    const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                                          + head_idx * max_num_partitions * HEAD_SIZE;
+    for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
+      out_ptr[i] = tmp_out_ptr[i];
+    }
+    // Terminate the thread block.
+    return;
+  }
+
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int warp_idx = threadIdx.x / WARP_SIZE;
+  const int lane = threadIdx.x % WARP_SIZE;
+
+  // Size: 2 * num_partitions.
+  extern __shared__ char shared_mem[];
+  // Workspace for reduction.
+  __shared__ float red_smem[2 * NUM_WARPS];
+
+  // Load max logits to shared memory.
+  float* shared_max_logits = reinterpret_cast<float*>(shared_mem);
+  const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+                                           + head_idx * max_num_partitions;
+  float max_logit = -FLT_MAX;
+  for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
+    const float l = max_logits_ptr[i];
+    shared_max_logits[i] = l;
+    max_logit = fmaxf(max_logit, l);
+  }
+  __syncthreads();
+
+  // Get the global max logit.
+  // Reduce within the warp.
+#pragma unroll
+  for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+    max_logit = fmaxf(max_logit, __shfl_xor_sync(uint32_t(-1), max_logit, mask));
+  }
+  if (lane == 0) {
+    red_smem[warp_idx] = max_logit;
+  }
+  __syncthreads();
+  // Reduce across warps.
+  max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
+#pragma unroll
+  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+    max_logit = fmaxf(max_logit, __shfl_xor_sync(uint32_t(-1), max_logit, mask));
+  }
+  // Broadcast the max value to all threads.
+  max_logit = __shfl_sync(uint32_t(-1), max_logit, 0);
+
+  // Load rescaled exp sums to shared memory.
+  float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
+  const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+                                       + head_idx * max_num_partitions;
+  float global_exp_sum = 0.0f;
+  for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
+    float l = shared_max_logits[i];
+    float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit);
+    global_exp_sum += rescaled_exp_sum;
+    shared_exp_sums[i] = rescaled_exp_sum;
+  }
+  __syncthreads();
+  global_exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], global_exp_sum);
+  const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
+
+  // Aggregate tmp_out to out.
+  const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                                        + head_idx * max_num_partitions * HEAD_SIZE;
+  scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+#pragma unroll
+  for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
+    float acc = 0.0f;
+    for (int j = 0; j < num_partitions; ++j) {
+      acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
+    }
+    from_float(out_ptr[i], acc);
+  }
+}
+
 } // namespace vllm

-#define LAUNCH_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS)                  \
-  vllm::single_query_cached_kv_attention_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS> \
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                            \
+  cudaFuncSetAttribute(                                                                 \
+    vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS>,             \
+    cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem_size);                      \
+  vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS>                \
   <<<grid, block, shared_mem_size, stream>>>(                                           \
     out_ptr,                                                                            \
     query_ptr,                                                                          \
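
Between the kernels and their launch macros it is worth restating what the v2 reduce step computes: for each (sequence, head) it merges per-partition (max_logit, exp_sum, partial output) triples into one softmax-consistent result, the standard log-sum-exp merge. A scalar sketch of that math (our helper, one output element, partitions already self-normalized):

    import math

    def merge_partitions(parts):
        # parts: [(max_logit, exp_sum, partial_out), ...] for one element.
        global_max = max(m for m, _, _ in parts)
        rescaled = [s * math.exp(m - global_max) for m, s, _ in parts]
        global_sum = sum(rescaled)
        return sum(w * o for w, (_, _, o) in zip(rescaled, parts)) / global_sum

    # Two partitions covering the context merge to the same value a single
    # full-context softmax pass would produce.
    print(merge_partitions([(2.0, 3.5, 0.25), (1.0, 1.8, 0.75)]))
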
@@ -362,7 +563,7 @@ template<
   typename T,
   int BLOCK_SIZE,
   int NUM_THREADS = 128>
-void single_query_cached_kv_attention_launcher(
+void paged_attention_v1_launcher(
   torch::Tensor& out,
   torch::Tensor& query,
   torch::Tensor& key_cache,
@@ -398,43 +599,37 @@ void single_query_cached_kv_attention_launcher(
   int* context_lens_ptr = context_lens.data_ptr<int>();

   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int padded_max_context_len = ((max_context_len + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;
+  int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
   int logits_size = padded_max_context_len * sizeof(float);
   int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
+  // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
+  // Keep that in sync with the logic here!
   int shared_mem_size = std::max(logits_size, outputs_size);

-  dim3 grid(num_heads, num_seqs);
+  dim3 grid(num_heads, num_seqs, 1);
   dim3 block(NUM_THREADS);
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   switch (head_size) {
-    // NOTE(woosuk): To reduce the compilation time, we omitted head sizes
-    // 32, 160, 192.
-    // case 32:
-    //   LAUNCH_ATTENTION_KERNEL(T, 32, BLOCK_SIZE, NUM_THREADS);
-    //   break;
+    // NOTE(woosuk): To reduce the compilation time, we only compile for the
+    // head sizes that we use in the model. However, we can easily extend this
+    // to support any head size which is a multiple of 16.
     case 64:
-      LAUNCH_ATTENTION_KERNEL(T, 64, BLOCK_SIZE, NUM_THREADS);
+      LAUNCH_PAGED_ATTENTION_V1(64);
       break;
     case 80:
-      LAUNCH_ATTENTION_KERNEL(T, 80, BLOCK_SIZE, NUM_THREADS);
+      LAUNCH_PAGED_ATTENTION_V1(80);
       break;
     case 96:
-      LAUNCH_ATTENTION_KERNEL(T, 96, BLOCK_SIZE, NUM_THREADS);
+      LAUNCH_PAGED_ATTENTION_V1(96);
       break;
     case 112:
-      LAUNCH_ATTENTION_KERNEL(T, 112, BLOCK_SIZE, NUM_THREADS);
+      LAUNCH_PAGED_ATTENTION_V1(112);
       break;
     case 128:
-      LAUNCH_ATTENTION_KERNEL(T, 128, BLOCK_SIZE, NUM_THREADS);
+      LAUNCH_PAGED_ATTENTION_V1(128);
       break;
-    // case 160:
-    //   LAUNCH_ATTENTION_KERNEL(T, 160, BLOCK_SIZE, NUM_THREADS);
-    //   break;
-    // case 192:
-    //   LAUNCH_ATTENTION_KERNEL(T, 192, BLOCK_SIZE, NUM_THREADS);
-    //   break;
     case 256:
-      LAUNCH_ATTENTION_KERNEL(T, 256, BLOCK_SIZE, NUM_THREADS);
+      LAUNCH_PAGED_ATTENTION_V1(256);
       break;
     default:
      TORCH_CHECK(false, "Unsupported head size: ", head_size);
@@ -442,8 +637,8 @@ void single_query_cached_kv_attention_launcher(
   }
 }

-#define CALL_KERNEL_LAUNCHER(T, BLOCK_SIZE)                  \
-  single_query_cached_kv_attention_launcher<T, BLOCK_SIZE>(  \
+#define CALL_V1_LAUNCHER(T, BLOCK_SIZE)                      \
+  paged_attention_v1_launcher<T, BLOCK_SIZE>(                \
     out,                                                     \
     query,                                                   \
     key_cache,                                               \
@@ -457,41 +652,23 @@ void single_query_cached_kv_attention_launcher(

 // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
 // 1, 2, 4, 64, 128, 256.
-#define CALL_KERNEL_LAUNCHER_BLOCK_SIZE(T)                          \
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T)                              \
   switch (block_size) {                                             \
-    /* case 1: */                                                   \
-    /*   CALL_KERNEL_LAUNCHER(T, 1); */                             \
-    /*   break; */                                                  \
-    /* case 2: */                                                   \
-    /*   CALL_KERNEL_LAUNCHER(T, 2); */                             \
-    /*   break; */                                                  \
-    /* case 4: */                                                   \
-    /*   CALL_KERNEL_LAUNCHER(T, 4); */                             \
-    /*   break; */                                                  \
     case 8:                                                         \
-      CALL_KERNEL_LAUNCHER(T, 8);                                   \
+      CALL_V1_LAUNCHER(T, 8);                                       \
       break;                                                        \
     case 16:                                                        \
-      CALL_KERNEL_LAUNCHER(T, 16);                                  \
+      CALL_V1_LAUNCHER(T, 16);                                      \
       break;                                                        \
     case 32:                                                        \
-      CALL_KERNEL_LAUNCHER(T, 32);                                  \
+      CALL_V1_LAUNCHER(T, 32);                                      \
       break;                                                        \
-    /* case 64: */                                                  \
-    /*   CALL_KERNEL_LAUNCHER(T, 64); */                            \
-    /*   break; */                                                  \
-    /* case 128: */                                                 \
-    /*   CALL_KERNEL_LAUNCHER(T, 128); */                           \
-    /*   break; */                                                  \
-    /* case 256: */                                                 \
-    /*   CALL_KERNEL_LAUNCHER(T, 256); */                           \
-    /*   break; */                                                  \
     default:                                                        \
       TORCH_CHECK(false, "Unsupported block size: ", block_size);   \
       break;                                                        \
   }

-void single_query_cached_kv_attention(
+void paged_attention_v1(
   torch::Tensor& out,             // [num_seqs, num_heads, head_size]
   torch::Tensor& query,           // [num_seqs, num_heads, head_size]
   torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
@@ -504,11 +681,186 @@ void single_query_cached_kv_attention(
|
|||||||
int max_context_len,
|
int max_context_len,
|
||||||
const c10::optional<torch::Tensor>& alibi_slopes) {
|
const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||||
if (query.dtype() == at::ScalarType::Float) {
|
if (query.dtype() == at::ScalarType::Float) {
|
||||||
CALL_KERNEL_LAUNCHER_BLOCK_SIZE(float);
|
CALL_V1_LAUNCHER_BLOCK_SIZE(float);
|
||||||
} else if (query.dtype() == at::ScalarType::Half) {
|
} else if (query.dtype() == at::ScalarType::Half) {
|
||||||
CALL_KERNEL_LAUNCHER_BLOCK_SIZE(uint16_t);
|
CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t);
|
||||||
} else if (query.dtype() == at::ScalarType::BFloat16) {
|
} else if (query.dtype() == at::ScalarType::BFloat16) {
|
||||||
CALL_KERNEL_LAUNCHER_BLOCK_SIZE(__nv_bfloat16);
|
CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16);
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
|
||||||
|
vllm::paged_attention_v2_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, PARTITION_SIZE> \
|
||||||
|
<<<grid, block, shared_mem_size, stream>>>( \
|
||||||
|
exp_sums_ptr, \
|
||||||
|
max_logits_ptr, \
|
||||||
|
tmp_out_ptr, \
|
||||||
|
query_ptr, \
|
||||||
|
key_cache_ptr, \
|
||||||
|
value_cache_ptr, \
|
||||||
|
head_mapping_ptr, \
|
||||||
|
scale, \
|
||||||
|
block_tables_ptr, \
|
||||||
|
context_lens_ptr, \
|
||||||
|
max_num_blocks_per_seq, \
|
||||||
|
alibi_slopes_ptr, \
|
||||||
|
q_stride, \
|
||||||
|
kv_block_stride, \
|
||||||
|
kv_head_stride); \
|
||||||
|
vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE> \
|
||||||
|
<<<reduce_grid, block, reduce_shared_mem_size, stream>>>( \
|
||||||
|
out_ptr, \
|
||||||
|
exp_sums_ptr, \
|
||||||
|
max_logits_ptr, \
|
||||||
|
tmp_out_ptr, \
|
||||||
|
context_lens_ptr, \
|
||||||
|
max_num_partitions);
|
||||||
|
|
||||||
|
template<
|
||||||
|
typename T,
|
||||||
|
int BLOCK_SIZE,
|
||||||
|
int NUM_THREADS = 128,
|
||||||
|
int PARTITION_SIZE = 512>
|
||||||
|
void paged_attention_v2_launcher(
|
||||||
|
torch::Tensor& out,
|
||||||
|
torch::Tensor& exp_sums,
|
||||||
|
torch::Tensor& max_logits,
|
||||||
|
torch::Tensor& tmp_out,
|
||||||
|
torch::Tensor& query,
|
||||||
|
torch::Tensor& key_cache,
|
||||||
|
torch::Tensor& value_cache,
|
||||||
|
torch::Tensor& head_mapping,
|
||||||
|
float scale,
|
||||||
|
torch::Tensor& block_tables,
|
||||||
|
torch::Tensor& context_lens,
|
||||||
|
int max_context_len,
|
||||||
|
const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||||
|
int num_seqs = query.size(0);
|
||||||
|
int num_heads = query.size(1);
|
||||||
|
int head_size = query.size(2);
|
||||||
|
int max_num_blocks_per_seq = block_tables.size(1);
|
||||||
|
int q_stride = query.stride(0);
|
||||||
|
int kv_block_stride = key_cache.stride(0);
|
||||||
|
int kv_head_stride = key_cache.stride(1);
|
||||||
|
|
||||||
|
int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
||||||
|
assert(head_size % thread_group_size == 0);
|
||||||
|
|
||||||
|
// NOTE: alibi_slopes is optional.
|
||||||
|
const float* alibi_slopes_ptr = alibi_slopes ?
|
||||||
|
reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
||||||
|
: nullptr;
|
||||||
|
|
||||||
|
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
||||||
|
float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
|
||||||
|
float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
|
||||||
|
T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
|
||||||
|
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
||||||
|
T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
|
||||||
|
T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
|
||||||
|
int* head_mapping_ptr = reinterpret_cast<int*>(head_mapping.data_ptr());
|
||||||
|
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||||
|
int* context_lens_ptr = context_lens.data_ptr<int>();
|
||||||
|
|
||||||
|
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||||
|
int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
|
||||||
|
int logits_size = PARTITION_SIZE * sizeof(float);
|
||||||
|
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
|
||||||
|
|
||||||
|
// For paged attention v2 kernel.
|
||||||
|
dim3 grid(num_heads, num_seqs, max_num_partitions);
|
||||||
|
int shared_mem_size = std::max(logits_size, outputs_size);
|
||||||
|
// For paged attention v2 reduce kernel.
|
||||||
|
dim3 reduce_grid(num_heads, num_seqs);
|
||||||
|
int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
|
||||||
|
|
||||||
|
dim3 block(NUM_THREADS);
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
switch (head_size) {
|
||||||
|
// NOTE(woosuk): To reduce the compilation time, we only compile for the
|
||||||
|
// head sizes that we use in the model. However, we can easily extend this
|
||||||
|
// to support any head size which is a multiple of 16.
|
||||||
|
case 64:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(64);
|
||||||
|
break;
|
||||||
|
case 80:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(80);
|
||||||
|
break;
|
||||||
|
case 96:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(96);
|
||||||
|
break;
|
||||||
|
case 112:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(112);
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(128);
|
||||||
|
break;
|
||||||
|
case 256:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(256);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CALL_V2_LAUNCHER(T, BLOCK_SIZE) \
|
||||||
|
paged_attention_v2_launcher<T, BLOCK_SIZE>( \
|
||||||
|
out, \
|
||||||
|
exp_sums, \
|
||||||
|
max_logits, \
|
||||||
|
tmp_out, \
|
||||||
|
query, \
|
||||||
|
key_cache, \
|
||||||
|
value_cache, \
|
||||||
|
head_mapping, \
|
||||||
|
scale, \
|
||||||
|
block_tables, \
|
||||||
|
context_lens, \
|
||||||
|
max_context_len, \
|
||||||
|
alibi_slopes);
|
||||||
|
|
||||||
|
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
|
||||||
|
// 1, 2, 4, 64, 128, 256.
|
||||||
|
#define CALL_V2_LAUNCHER_BLOCK_SIZE(T) \
|
||||||
|
switch (block_size) { \
|
||||||
|
case 8: \
|
||||||
|
CALL_V2_LAUNCHER(T, 8); \
|
||||||
|
break; \
|
||||||
|
case 16: \
|
||||||
|
CALL_V2_LAUNCHER(T, 16); \
|
||||||
|
break; \
|
||||||
|
case 32: \
|
||||||
|
CALL_V2_LAUNCHER(T, 32); \
|
||||||
|
break; \
|
||||||
|
default: \
|
||||||
|
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||||
|
break; \
|
||||||
|
}
|
||||||
|
|
||||||
|
void paged_attention_v2(
  torch::Tensor& out,             // [num_seqs, num_heads, head_size]
  torch::Tensor& exp_sums,        // [num_seqs, num_heads, max_num_partitions]
  torch::Tensor& max_logits,      // [num_seqs, num_heads, max_num_partitions]
  torch::Tensor& tmp_out,         // [num_seqs, num_heads, max_num_partitions, head_size]
  torch::Tensor& query,           // [num_seqs, num_heads, head_size]
  torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
  torch::Tensor& value_cache,     // [num_blocks, num_heads, head_size, block_size]
  torch::Tensor& head_mapping,    // [num_heads]
  float scale,
  torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
  torch::Tensor& context_lens,    // [num_seqs]
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes) {
  if (query.dtype() == at::ScalarType::Float) {
    CALL_V2_LAUNCHER_BLOCK_SIZE(float);
  } else if (query.dtype() == at::ScalarType::Half) {
    CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t);
  } else if (query.dtype() == at::ScalarType::BFloat16) {
    CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16);
  } else {
    TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
  }
}
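Before this entry point can run, the caller has to allocate the per-partition temporaries whose shapes are documented in the signature. A minimal sizing sketch in Python, assuming the launcher's PARTITION_SIZE constant is 512 (an assumption about the compiled kernel; the rounding mirrors DIVIDE_ROUND_UP above):

    import torch

    PARTITION_SIZE = 512  # assumption: must match the compiled kernel constant
    num_seqs, num_heads, head_size = 4, 32, 128
    max_context_len = 4096

    # mirrors DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE) in the launcher
    max_num_partitions = (max_context_len + PARTITION_SIZE - 1) // PARTITION_SIZE  # 8
    exp_sums = torch.empty(num_seqs, num_heads, max_num_partitions,
                           dtype=torch.float32, device="cuda")
    max_logits = torch.empty_like(exp_sums)
    tmp_out = torch.empty(num_seqs, num_heads, max_num_partitions, head_size,
                          dtype=torch.float16, device="cuda")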
@@ -517,3 +869,4 @@ void single_query_cached_kv_attention(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
+#undef DIVIDE_ROUND_UP

@@ -420,6 +420,11 @@ inline __device__ void from_float(bf16_8_t& dst, Float8_ src) {
 #endif
 }

+// From bfloat16 to float32.
+inline __device__ float to_float(__nv_bfloat16 u) {
+  return __bfloat162float(u);
+}
+
 // Zero-out a variable.
 inline __device__ void zero(__nv_bfloat16& dst) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800

@@ -26,22 +26,3 @@ void gather_cached_kv(
   torch::Tensor& key_cache,
   torch::Tensor& value_cache,
   torch::Tensor& slot_mapping);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "swap_blocks",
-    &swap_blocks,
-    "Swap in (out) the cache blocks from src to dst");
-  m.def(
-    "copy_blocks",
-    &copy_blocks,
-    "Copy the cache blocks from src to dst");
-  m.def(
-    "reshape_and_cache",
-    &reshape_and_cache,
-    "Reshape the key and value tensors and cache them");
-  m.def(
-    "gather_cached_kv",
-    &gather_cached_kv,
-    "Gather key and value from the cache into contiguous QKV tensors");
-}

@@ -55,26 +55,26 @@ template<typename scalar_t>
 __global__ void copy_blocks_kernel(
   int64_t* key_cache_ptrs,
   int64_t* value_cache_ptrs,
-  const int* __restrict__ block_mapping,
+  const int64_t* __restrict__ block_mapping,
   const int numel_per_block) {
   const int layer_idx = blockIdx.x;
   const int pair_idx = blockIdx.y;

   scalar_t* key_cache = reinterpret_cast<scalar_t*>(key_cache_ptrs[layer_idx]);
   scalar_t* value_cache = reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]);
-  int src_block_number = block_mapping[2 * pair_idx];
-  int dst_block_number = block_mapping[2 * pair_idx + 1];
+  int64_t src_block_number = block_mapping[2 * pair_idx];
+  int64_t dst_block_number = block_mapping[2 * pair_idx + 1];

-  const int src_block_offset = src_block_number * numel_per_block;
-  const int dst_block_offset = dst_block_number * numel_per_block;
+  const int64_t src_block_offset = src_block_number * numel_per_block;
+  const int64_t dst_block_offset = dst_block_number * numel_per_block;
   for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) {
-    int src_offset = src_block_offset + i;
-    int dst_offset = dst_block_offset + i;
+    int64_t src_offset = src_block_offset + i;
+    int64_t dst_offset = dst_block_offset + i;
     key_cache[dst_offset] = key_cache[src_offset];
   }
   for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) {
-    int src_offset = src_block_offset + i;
-    int dst_offset = dst_block_offset + i;
+    int64_t src_offset = src_block_offset + i;
+    int64_t dst_offset = dst_block_offset + i;
     value_cache[dst_offset] = value_cache[src_offset];
   }
 }

@@ -102,15 +102,15 @@ void copy_blocks(
     value_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
   }
   // Create block mapping array.
-  std::vector<int> block_mapping_vec;
+  std::vector<int64_t> block_mapping_vec;
   for (const auto& pair : block_mapping) {
-    int src_block_number = pair.first;
-    for (int dst_block_number : pair.second) {
+    int64_t src_block_number = pair.first;
+    for (int64_t dst_block_number : pair.second) {
       block_mapping_vec.push_back(src_block_number);
       block_mapping_vec.push_back(dst_block_number);
     }
   }
-  int* block_mapping_array = block_mapping_vec.data();
+  int64_t* block_mapping_array = block_mapping_vec.data();
   int num_pairs = block_mapping_vec.size() / 2;

   // Move the data structures to the GPU.
@@ -120,7 +120,7 @@ void copy_blocks(
   torch::Tensor value_cache_ptrs_tensor = torch::from_blob(
     value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
   torch::Tensor block_mapping_tensor = torch::from_blob(
-    block_mapping_array, {2 * num_pairs}, torch::kInt).to(cache_device);
+    block_mapping_array, {2 * num_pairs}, torch::kInt64).to(cache_device);

   // Launch the kernel.
   const int numel_per_block = key_caches[0][0].numel();
@@ -132,7 +132,7 @@ void copy_blocks(
     vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
       key_cache_ptrs_tensor.data_ptr<int64_t>(),
       value_cache_ptrs_tensor.data_ptr<int64_t>(),
-      block_mapping_tensor.data_ptr<int>(),
+      block_mapping_tensor.data_ptr<int64_t>(),
       numel_per_block);
   }));
 }

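A worked example (numbers invented for illustration) of why these offsets were widened from int to int64_t: the product of a block number and the per-block element count can exceed the 32-bit range on large caches.

    num_heads, head_size, block_size = 40, 128, 16
    numel_per_block = num_heads * head_size * block_size  # 81,920 elements per layer block
    block_number = 30_000                                 # plausible for a large KV cache
    offset = block_number * numel_per_block               # 2,457,600,000
    print(offset > 2**31 - 1)                             # True: would overflow int32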
@@ -141,43 +141,48 @@ namespace vllm {

 template<typename scalar_t>
 __global__ void reshape_and_cache_kernel(
   const scalar_t* __restrict__ key,      // [num_tokens, num_heads, head_size]
   const scalar_t* __restrict__ value,    // [num_tokens, num_heads, head_size]
   scalar_t* __restrict__ key_cache,      // [num_blocks, num_heads, head_size/x, block_size, x]
   scalar_t* __restrict__ value_cache,    // [num_blocks, num_heads, head_size, block_size]
-  const int* __restrict__ slot_mapping,  // [num_tokens]
+  const int64_t* __restrict__ slot_mapping,  // [num_tokens]
   const int key_stride,
   const int value_stride,
   const int num_heads,
   const int head_size,
   const int block_size,
   const int x) {
-  const int token_idx = blockIdx.x;
-  const int slot_idx = slot_mapping[token_idx];
-  const int block_idx = slot_idx / block_size;
-  const int block_offset = slot_idx % block_size;
+  const int64_t token_idx = blockIdx.x;
+  const int64_t slot_idx = slot_mapping[token_idx];
+  if (slot_idx < 0) {
+    // Padding token that should be ignored.
+    return;
+  }
+
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t block_offset = slot_idx % block_size;

   const int n = num_heads * head_size;
   for (int i = threadIdx.x; i < n; i += blockDim.x) {
-    const int src_key_idx = token_idx * key_stride + i;
-    const int src_value_idx = token_idx * value_stride + i;
+    const int64_t src_key_idx = token_idx * key_stride + i;
+    const int64_t src_value_idx = token_idx * value_stride + i;

     const int head_idx = i / head_size;
     const int head_offset = i % head_size;
     const int x_idx = head_offset / x;
     const int x_offset = head_offset % x;

-    const int tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
+    const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
                             + head_idx * (head_size / x) * block_size * x
                             + x_idx * block_size * x
                             + block_offset * x
                             + x_offset;
-    const int tgt_value_idx = block_idx * num_heads * head_size * block_size
+    const int64_t tgt_value_idx = block_idx * num_heads * head_size * block_size
                               + head_idx * head_size * block_size
                               + head_offset * block_size
                               + block_offset;
-    key_cache[tgt_key_idx] = __ldg(&key[src_key_idx]);
-    value_cache[tgt_value_idx] = __ldg(&value[src_value_idx]);
+    key_cache[tgt_key_idx] = key[src_key_idx];
+    value_cache[tgt_value_idx] = value[src_value_idx];
   }
 }

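A small sketch of the slot-mapping convention the new early-return handles (values invented): non-negative entries address a slot in the paged cache, while negative entries mark padding tokens that the kernel now skips instead of writing out of bounds.

    import torch

    block_size = 16
    # tokens 0-2 land in block 5 (slots 80-82); token 3 is padding.
    slot_mapping = torch.tensor([80, 81, 82, -1], dtype=torch.int64, device="cuda")
    block_idx = slot_mapping[0] // block_size    # tensor(5): which cache block
    block_offset = slot_mapping[0] % block_size  # tensor(0): position inside it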
@@ -211,7 +216,7 @@ void reshape_and_cache(
       value.data_ptr<scalar_t>(),
       key_cache.data_ptr<scalar_t>(),
       value_cache.data_ptr<scalar_t>(),
-      slot_mapping.data_ptr<int>(),
+      slot_mapping.data_ptr<int64_t>(),
       key_stride,
       value_stride,
       num_heads,

csrc/cuda_utils.h (new file, 5 additions)
@@ -0,0 +1,5 @@
#include <torch/extension.h>

int get_device_attribute(
  int attribute,
  int device_id);

csrc/cuda_utils_kernels.cu (new file, 14 additions)
@@ -0,0 +1,14 @@
int get_device_attribute(
  int attribute,
  int device_id)
{
  int device, value;
  if (device_id < 0) {
    cudaGetDevice(&device);
  }
  else {
    device = device_id;
  }
  cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute), device);
  return value;
}
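A usage sketch for the new utility; both the Python-side import path and the attribute id are assumptions (97 corresponds to cudaDevAttrMaxSharedMemoryPerBlockOptin in the CUDA runtime's cudaDeviceAttr enum).

    from vllm._C import cuda_utils  # hypothetical import path

    max_dyn_smem = cuda_utils.get_device_attribute(97, 0)  # (attribute, device_id)
    print(f"opt-in shared memory per block: {max_dyn_smem} bytes")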
@@ -1,14 +0,0 @@
-#include <torch/extension.h>
-
-void rms_norm(
-  torch::Tensor& out,
-  torch::Tensor& input,
-  torch::Tensor& weight,
-  float epsilon);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "rms_norm",
-    &rms_norm,
-    "Apply Root Mean Square (RMS) Normalization to the input tensor.");
-}

@@ -9,8 +9,8 @@ namespace vllm {
 // TODO(woosuk): Further optimize this kernel.
 template<typename scalar_t>
 __global__ void rms_norm_kernel(
-  scalar_t* __restrict__ out,           // [num_tokens, hidden_size]
-  const scalar_t* __restrict__ input,   // [num_tokens, hidden_size]
+  scalar_t* __restrict__ out,           // [..., hidden_size]
+  const scalar_t* __restrict__ input,   // [..., hidden_size]
   const scalar_t* __restrict__ weight,  // [hidden_size]
   const float epsilon,
   const int num_tokens,

@@ -34,15 +34,45 @@ __global__ void rms_norm_kernel(
   }
 }

+// TODO: Further optimize this kernel.
+template<typename scalar_t>
+__global__ void fused_add_rms_norm_kernel(
+  scalar_t* __restrict__ input,         // [..., hidden_size]
+  scalar_t* __restrict__ residual,      // [..., hidden_size]
+  const scalar_t* __restrict__ weight,  // [hidden_size]
+  const float epsilon,
+  const int num_tokens,
+  const int hidden_size) {
+  __shared__ float s_variance;
+  float variance = 0.0f;
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float) input[blockIdx.x * hidden_size + idx];
+    x += (float) residual[blockIdx.x * hidden_size + idx];
+    variance += x * x;
+    residual[blockIdx.x * hidden_size + idx] = (scalar_t) x;
+  }
+  variance = blockReduceSum<float>(variance);
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(variance / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float) residual[blockIdx.x * hidden_size + idx];
+    input[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx];
+  }
+}
+
 } // namespace vllm

 void rms_norm(
-  torch::Tensor& out,     // [num_tokens, hidden_size]
-  torch::Tensor& input,   // [num_tokens, hidden_size]
+  torch::Tensor& out,     // [..., hidden_size]
+  torch::Tensor& input,   // [..., hidden_size]
   torch::Tensor& weight,  // [hidden_size]
   float epsilon) {
-  int num_tokens = input.size(0);
-  int hidden_size = input.size(1);
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;

   dim3 grid(num_tokens);
   dim3 block(std::min(hidden_size, 1024));

@@ -60,3 +90,28 @@ void rms_norm(
       hidden_size);
     });
 }
+
+void fused_add_rms_norm(
+  torch::Tensor& input,     // [..., hidden_size]
+  torch::Tensor& residual,  // [..., hidden_size]
+  torch::Tensor& weight,    // [hidden_size]
+  float epsilon) {
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(hidden_size, 1024));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(
+    input.scalar_type(),
+    "fused_add_rms_norm_kernel",
+    [&] {
+      vllm::fused_add_rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
+        input.data_ptr<scalar_t>(),
+        residual.data_ptr<scalar_t>(),
+        weight.data_ptr<scalar_t>(),
+        epsilon,
+        num_tokens,
+        hidden_size);
+    });
+}

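For reference, per token vector x with residual r, weight w, and hidden size d, the fused kernel above computes (a restatement of the code in math, nothing new):

\[
h = x + r, \qquad r \leftarrow h, \qquad
x_i \leftarrow \frac{h_i}{\sqrt{\frac{1}{d}\sum_{j=1}^{d} h_j^2 + \epsilon}}\, w_i .
\]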
csrc/ops.h (new file, 75 additions)
@@ -0,0 +1,75 @@
#include <torch/extension.h>

void paged_attention_v1(
  torch::Tensor& out,
  torch::Tensor& query,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
  torch::Tensor& head_mapping,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes);

void paged_attention_v2(
  torch::Tensor& out,
  torch::Tensor& exp_sums,
  torch::Tensor& max_logits,
  torch::Tensor& tmp_out,
  torch::Tensor& query,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
  torch::Tensor& head_mapping,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes);

void rms_norm(
  torch::Tensor& out,
  torch::Tensor& input,
  torch::Tensor& weight,
  float epsilon);

void fused_add_rms_norm(
  torch::Tensor& input,
  torch::Tensor& residual,
  torch::Tensor& weight,
  float epsilon);

void rotary_embedding(
  torch::Tensor& positions,
  torch::Tensor& query,
  torch::Tensor& key,
  int head_size,
  torch::Tensor& cos_sin_cache,
  bool is_neox);

void silu_and_mul(
  torch::Tensor& out,
  torch::Tensor& input);

void gelu_new(
  torch::Tensor& out,
  torch::Tensor& input);

void gelu_fast(
  torch::Tensor& out,
  torch::Tensor& input);

torch::Tensor awq_gemm(
  torch::Tensor _in_feats,
  torch::Tensor _kernel,
  torch::Tensor _scaling_factors,
  torch::Tensor _zeros,
  int split_k_iters);

void squeezellm_gemm(
  torch::Tensor vec,
  torch::Tensor mat,
  torch::Tensor mul,
  torch::Tensor lookup_table);

@@ -1,16 +0,0 @@
-#include <torch/extension.h>
-
-void rotary_embedding(
-  torch::Tensor& positions,
-  torch::Tensor& query,
-  torch::Tensor& key,
-  int head_size,
-  torch::Tensor& cos_sin_cache,
-  bool is_neox);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "rotary_embedding",
-    &rotary_embedding,
-    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
-}

@@ -37,9 +37,9 @@ inline __device__ void apply_rotary_embedding(

 template<typename scalar_t, bool IS_NEOX>
 __global__ void rotary_embedding_kernel(
-  const int64_t* __restrict__ positions,  // [num_tokens]
-  scalar_t* __restrict__ query,           // [num_tokens, num_heads, head_size]
-  scalar_t* __restrict__ key,             // [num_tokens, num_kv_heads, head_size]
+  const int64_t* __restrict__ positions,  // [batch_size, seq_len] or [num_tokens]
+  scalar_t* __restrict__ query,           // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
+  scalar_t* __restrict__ key,             // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
   const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim // 2]
   const int rot_dim,
   const int query_stride,

@@ -78,18 +78,18 @@ __global__ void rotary_embedding_kernel(
 } // namespace vllm

 void rotary_embedding(
-  torch::Tensor& positions,  // [num_tokens]
-  torch::Tensor& query,      // [num_tokens, num_heads * head_size]
-  torch::Tensor& key,        // [num_tokens, num_kv_heads * head_size]
+  torch::Tensor& positions,  // [batch_size, seq_len] or [num_tokens]
+  torch::Tensor& query,      // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size]
+  torch::Tensor& key,        // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size]
   int head_size,
   torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
   bool is_neox) {
-  int num_tokens = query.size(0);
+  int64_t num_tokens = query.numel() / query.size(-1);
   int rot_dim = cos_sin_cache.size(1);
-  int num_heads = query.size(1) / head_size;
-  int num_kv_heads = key.size(1) / head_size;
-  int query_stride = query.stride(0);
-  int key_stride = key.stride(0);
+  int num_heads = query.size(-1) / head_size;
+  int num_kv_heads = key.size(-1) / head_size;
+  int query_stride = query.stride(-2);
+  int key_stride = key.stride(-2);

   dim3 grid(num_tokens);
   dim3 block(std::min(num_heads * rot_dim / 2, 512));

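Since the indexing above is now relative to the last dimension, the op accepts either a flattened or a batched layout; a sketch (tensor contents are placeholders):

    import torch

    num_heads, head_size = 32, 128
    # flattened: [num_tokens, num_heads * head_size]
    q1 = torch.empty(10, num_heads * head_size, dtype=torch.float16, device="cuda")
    # batched: [batch_size, seq_len, num_heads * head_size]
    q2 = torch.empty(2, 5, num_heads * head_size, dtype=torch.float16, device="cuda")
    # both give num_tokens = q.numel() // q.size(-1) == 10, as computed above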
csrc/pybind.cpp (new file, 80 additions)
@@ -0,0 +1,80 @@
#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // vLLM custom ops
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");

  // Attention ops
  ops.def(
    "paged_attention_v1",
    &paged_attention_v1,
    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
  ops.def(
    "paged_attention_v2",
    &paged_attention_v2,
    "PagedAttention V2.");

  // Activation ops
  ops.def(
    "silu_and_mul",
    &silu_and_mul,
    "Activation function used in SwiGLU.");
  ops.def(
    "gelu_new",
    &gelu_new,
    "GELU implementation used in GPT-2.");
  ops.def(
    "gelu_fast",
    &gelu_fast,
    "Approximate GELU implementation.");

  // Layernorm
  ops.def(
    "rms_norm",
    &rms_norm,
    "Apply Root Mean Square (RMS) Normalization to the input tensor.");

  ops.def(
    "fused_add_rms_norm",
    &fused_add_rms_norm,
    "In-place fused Add and RMS Normalization");

  // Rotary embedding
  ops.def(
    "rotary_embedding",
    &rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

  // Quantization ops
  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");

  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def(
    "swap_blocks",
    &swap_blocks,
    "Swap in (out) the cache blocks from src to dst");
  cache_ops.def(
    "copy_blocks",
    &copy_blocks,
    "Copy the cache blocks from src to dst");
  cache_ops.def(
    "reshape_and_cache",
    &reshape_and_cache,
    "Reshape the key and value tensors and cache them");
  cache_ops.def(
    "gather_cached_kv",
    &gather_cached_kv,
    "Gather key and value from the cache into contiguous QKV tensors");

  // Cuda utils
  pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
  cuda_utils.def(
    "get_device_attribute",
    &get_device_attribute,
    "Gets the specified device attribute.");
}
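A sketch of reaching the consolidated bindings from Python; the extension name (vllm._C) is an assumption, since TORCH_EXTENSION_NAME is set by the build, not by this file.

    import torch
    from vllm._C import ops  # hypothetical import path

    x = torch.randn(4, 4096, dtype=torch.float16, device="cuda")
    w = torch.ones(4096, dtype=torch.float16, device="cuda")
    out = torch.empty_like(x)
    ops.rms_norm(out, x, w, 1e-6)  # one submodule per op family replaces the old per-file modules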
csrc/quantization/awq/dequantize.cuh (new file, 87 additions)
@@ -0,0 +1,87 @@
/*
Adapted from https://github.com/mit-han-lab/llm-awq
Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
@article{lin2023awq,
  title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration},
  author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song},
  journal={arXiv},
  year={2023}
}
*/

#pragma once

namespace vllm {
namespace awq {

__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
  assert(false);
#else
  uint4 result;

  uint32_t* h = reinterpret_cast<uint32_t*>(&result);
  uint32_t const i4s = reinterpret_cast<uint32_t const&>(source);

  // First, we extract the i4s and construct an intermediate fp16 number.
  static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
  static constexpr uint32_t BOTTOM_MASK = 0x000f000f;
  static constexpr uint32_t TOP_MASK = 0x00f000f0;
  static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400;

  // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing
  // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions.
  // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and
  // elt_67 to fp16 without having to shift them to the bottom bits beforehand.

  // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue
  // immediately before required.
  const uint32_t top_i4s = i4s >> 8;
  // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(h[0])
               : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
  // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(h[1])
               : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
  // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(h[2])
               : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
  // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(h[3])
               : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));

  // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the
  // half2 ctor. In this case, I chose performance reliability over code readability.

  // This is the half2 {1032, 1032} represented as an integer.
  // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408;
  // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7]
  static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400;
  // This is the half2 {1 / 16, 1 / 16} represented as an integer.
  static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00;
  // This is the half2 {-72, -72} represented as an integer.
  // static constexpr uint32_t NEG_72 = 0xd480d480;
  // Haotian: Let's use {-64, -64}.
  static constexpr uint32_t NEG_64 = 0xd400d400;

  // Finally, we construct the output numbers.
  // Convert elt_01
  asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM));
  // Convert elt_23
  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
  // Convert elt_45
  asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM));
  // Convert elt_67
  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64));

  return result;
#endif
}

} // namespace awq
} // namespace vllm
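A sketch, outside the diff, of the magic-number trick the lop3/sub sequence above relies on (assumes numpy): splicing a 4-bit value q into the low mantissa bits of the fp16 pattern 0x6400 (which is 1024.0) yields 1024 + q, so subtracting 1024 recovers q without any integer-to-float conversion instruction.

    import numpy as np

    def dequant_nibble(q: int) -> np.float16:
        bits = np.uint16(0x6400 | (q & 0xF))  # OR q into the fp16 mantissa
        magic = bits.view(np.float16)         # reinterpret: equals 1024 + q
        return magic - np.float16(1024.0)     # peel off the magic constant

    assert all(dequant_nibble(q) == q for q in range(16))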
csrc/quantization/awq/gemm_kernels.cu (new file, 560 additions)
@@ -0,0 +1,560 @@
/*
Adapted from https://github.com/mit-han-lab/llm-awq
@article{lin2023awq,
  title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration},
  author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song},
  journal={arXiv},
  year={2023}
}
*/

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

#include "dequantize.cuh"

#include <cuda_fp16.h>

namespace vllm {
namespace awq {

// Pack two half values.
static inline __device__ __host__ unsigned
__pack_half2(const half x, const half y) {
  unsigned v0 = *((unsigned short *)&x);
  unsigned v1 = *((unsigned short *)&y);
  return (v1 << 16) | v0;
}

__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, int split_k_iters, half* __restrict__ A, int* __restrict__ B, half* __restrict__ scaling_factors, int* __restrict__ zeros, int M, int IC, int OC, half* __restrict__ C)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
  assert(false);
#else
  static constexpr uint32_t ZERO = 0x0;
  float C_warp[32];
  __shared__ half A_shared[16 * (32 + 8)];
  __shared__ half B_shared[32 * (128 + 8)];

  __shared__ half scaling_factors_shared[128];
  __shared__ half zeros_shared[128];

  int j_factors1 = ((OC + 128 - 1) / 128);
  int blockIdx_x = 0;
  int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1);
  int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1);

  half A_shared_warp[8];
  half B_shared_warp[32];
  for (int j_0_4_init = 0; j_0_4_init < 4; ++j_0_4_init) {
    for (int i = 0; i < 8; ++i) {
      C_warp[(j_0_4_init * 8) + i] = 0.0;
    }
  }

  static constexpr int row_stride_warp = 32 * 8 / 32;
  static constexpr int row_stride = 2 * 32 * 8 / 128;
  bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < 128;
  // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16
  bool ld_A_flag = (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + threadIdx.x * 8 / 32) < M;  // threadIdx.y is warp_id
  // bool wb_C_flag = (threadIdx.x / 4) < M;

  half* A_ptr = A
    + (((int)blockIdx_y) / j_factors1 * 16 + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * IC
    + (((int)threadIdx.x) % (32 / 8)) * 8;

  int* B_ptr = B
    + ((int)threadIdx.y) * (OC / 8) * 2
    + (((int)threadIdx.x) / (128 / 8)) * (OC / 8)
    + (((int)blockIdx_y) % j_factors1) * (128 / 8)
    + (((int)threadIdx.x) % (128 / 8)) * 1;
  // Why * 1 in the above line?

  half* A_shared_ptr = A_shared
    + ((int)threadIdx.y) * row_stride_warp * (32 + 8)
    + (((int)threadIdx.x) / (32 / 8)) * (32 + 8)
    + (((int)threadIdx.x) % (32 / 8)) * 8;

  half* B_shared_ptr = B_shared
    + ((int)threadIdx.y) * (row_stride / 2) * (128 + 8)
    + (((int)threadIdx.x) / (128 / 8)) * (128 + 8)
    + (((int)threadIdx.x) % (128 / 8)) * 8;

  int* zeros_ptr = zeros
    + (((int)blockIdx_y) % j_factors1) * (128 / 8)
    + ((int)threadIdx.x) % (128 / 8);

  half* scaling_factors_ptr = scaling_factors
    + (((int)blockIdx_y) % j_factors1) * (128)
    + (((int)threadIdx.x) % (128 / 8)) * 8;

  half* C_ptr = C
    + static_cast<long long>(blockIdx_z) * M * OC  // blockIdx_z -> split_k dim
    + (((int)blockIdx_y) % j_factors1) * 128
    + ((int)threadIdx.y) * 64
    + (((int)threadIdx.x) % 4) * 2;

  // preload s.f. and zeros
  int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters;
  if ((k_bound - 1) * split_k_iters * 32 + blockIdx_z * 32 >= IC) k_bound -= 1;
  for (int _k_0_0 = 0; _k_0_0 < k_bound; ++_k_0_0) {
    int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z;
    __syncthreads();
    // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16
    if (ld_A_flag)
    {
      *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32));
    }
    else
    {
      *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0);
    }

    // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) {
    uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8));
    uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
    uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC));
    /*
    if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && threadIdx.y == 0){
      printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w);
    }
    */
    // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0);
    int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8);

    for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0) {

      // B: 32 x 136 (128+8) float16
      // each warp: 32 x 4
      // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus zero -> WB UINT4
      // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * 8)));
      // row stride in shared memory: (NWARPS * 32 * 8 / cta_N)
      uint32_t B_loaded = *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8));
      uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
      //uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / 8)) * 8);

      // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8);
      // - zero and * scale
      // TODO (Haotian): can save 4 assembly instructions if formulated as deq = q * scale - zero * scale.
      asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x));
      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO));
      asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y));
      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO));
      asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z));
      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO));
      asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w));
      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO));
      /*
      if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){
        printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w);
      }
      */

      // write back
      *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (128 + 8)) = B_loaded_fp16;
    }
    __syncthreads();

    for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) {
      {
        unsigned int addr;
        __asm__ __volatile__(
          "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
          : "=r"(addr)
          : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8))))
        );

        __asm__ __volatile__(
          "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
          "{%0, %1, %2, %3}, [%4];\n"
          : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3])
          : "r"(addr)
        );
      }

      for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) {
        {
          unsigned int addr;
          __asm__ __volatile__(
            "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
            : "=r"(addr)
            : "l"((void *)((&(B_shared[(((k_0_1 * 2176) + (((int)threadIdx.y) * 64)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 136) + ((((int)threadIdx.x) >> 4) * 8))))
          );
          __asm__ __volatile__(
            "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
            "{%0, %1, %2, %3}, [%4];\n"
            : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3])
            : "r"(addr)
          );
        }
      }
      for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
            : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
        }

        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
            : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
        }

        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
            : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
        }

        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
            : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
        }
#else
        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
            : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
        }

        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
            : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
        }

#endif
      }
    }
  }

  // TODO: Shang: Hoist loop invariance.
  for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) {
    for (int local_id = 0; local_id < 8; ++local_id) {
      int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8;
      if (row_offset < M)
      {
        *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]);
      }
    }
  }
#endif
}

__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, int split_k_iters, half* __restrict__ A, int* __restrict__ B, half* __restrict__ scaling_factors, int* __restrict__ zeros, int M, int IC, int OC, half* __restrict__ C)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
  assert(false);
#else
  static constexpr uint32_t ZERO = 0x0;
  float C_warp[32];
  __shared__ half A_shared[16 * (32 + 8)];
  __shared__ half B_shared[32 * (64 + 8)];

  __shared__ half scaling_factors_shared[64];
  __shared__ half zeros_shared[64];

  int j_factors1 = ((OC + 64 - 1) / 64);

  int blockIdx_x = 0;
  int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1);
  int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1);

  half A_shared_warp[8];
  half B_shared_warp[16];
  for (int j_0_4_init = 0; j_0_4_init < 2; ++j_0_4_init) {
    for (int i = 0; i < 8; ++i) {
      C_warp[(j_0_4_init * 8) + i] = 0.0;
    }
  }

  static constexpr int row_stride_warp = 32 * 8 / 32;
  static constexpr int row_stride = 2 * 32 * 8 / 64;
  bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < 64;
  // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16
  bool ld_A_flag = (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + threadIdx.x * 8 / 32) < M;  // threadIdx.y is warp_id
  // bool wb_C_flag = (threadIdx.x / 4) < M;

  half* A_ptr = A
    + (((int)blockIdx_y) / j_factors1 * 16 + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * IC
    + (((int)threadIdx.x) % (32 / 8)) * 8;

  int* B_ptr = B
    + ((int)threadIdx.y) * (OC / 8) * 4
    + (((int)threadIdx.x) / (64 / 8)) * (OC / 8)
    + (((int)blockIdx_y) % j_factors1) * (64 / 8)
    + (((int)threadIdx.x) % (64 / 8)) * 1;
  // Why * 1 in the above line?

  half* A_shared_ptr = A_shared
    + ((int)threadIdx.y) * row_stride_warp * (32 + 8)
    + (((int)threadIdx.x) / (32 / 8)) * (32 + 8)
    + (((int)threadIdx.x) % (32 / 8)) * 8;

  half* B_shared_ptr = B_shared
    + ((int)threadIdx.y) * (row_stride / 2) * (64 + 8)
    + (((int)threadIdx.x) / (64 / 8)) * (64 + 8)
    + (((int)threadIdx.x) % (64 / 8)) * 8;

  int* zeros_ptr = zeros
    + (((int)blockIdx_y) % j_factors1) * (64 / 8)
    + ((int)threadIdx.x) % (64 / 8);

  half* scaling_factors_ptr = scaling_factors
    + (((int)blockIdx_y) % j_factors1) * (64)
    + (((int)threadIdx.x) % (64 / 8)) * 8;

  half* C_ptr = C
    + static_cast<long long>(blockIdx_z) * M * OC  // blockIdx_z -> split_k dim
    + (((int)blockIdx_y) % j_factors1) * 64
    + ((int)threadIdx.y) * 32
    + (((int)threadIdx.x) % 4) * 2;

  // preload s.f. and zeros
  int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters;
  if ((k_bound - 1) * split_k_iters * 32 + blockIdx_z * 32 >= IC) k_bound -= 1;
  for (int _k_0_0 = 0; _k_0_0 < k_bound; ++_k_0_0) {
    int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z;
    __syncthreads();
    // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16
    if (ld_A_flag)
    {
      *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32));
    }
    else
    {
      *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0);
    }

    // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) {
    uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8));
    uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
    uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC));
    /*
    if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && threadIdx.y == 0){
      printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w);
    }
    */
    // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0);
    int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8);

    for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 4; ++ax0_ax1_fused_0) {

      // B: 32 x 136 (128+8) float16
      // each warp: 32 x 4
      // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus zero -> WB UINT4
      // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * 8)));
      // row stride in shared memory: (NWARPS * 32 * 8 / cta_N)
      uint32_t B_loaded = *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8));
      uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
      //uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / 8)) * 8);

      // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8);
      // - zero and * scale
      // TODO (Haotian): can save 4 assembly instructions if formulated as deq = q * scale - zero * scale.
      asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x));
      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO));
      asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y));
      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO));
      asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z));
      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO));
      asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w));
      asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO));
      /*
      if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){
        printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w);
      }
      */

      // write back
      *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (64 + 8)) = B_loaded_fp16;
    }
    __syncthreads();

    for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1)
    {
      {
        unsigned int addr;
        __asm__ __volatile__(
          "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
          : "=r"(addr)
          : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8))))
        );
        __asm__ __volatile__(
          "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
          "{%0, %1, %2, %3}, [%4];\n"
          : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3])
          : "r"(addr)
        );
      }

      for (int ax1_0 = 0; ax1_0 < 2; ++ax1_0)
      {
        {
          unsigned int addr;
          __asm__ __volatile__(
            "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
            : "=r"(addr)
            : "l"((void *)((&(B_shared[(((k_0_1 * 1152) + (((int)threadIdx.y) * 32)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 72) + ((((int)threadIdx.x) >> 4) * 8))))
          );
          __asm__ __volatile__(
            "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
            "{%0, %1, %2, %3}, [%4];\n"
            : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3])
            : "r"(addr)
          );
        }
      }

      for (int j_0_4 = 0; j_0_4 < 2; ++j_0_4)
      {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
            : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
        }

        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
            : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
        }

        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
            : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
        }

        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
            : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
        }
#else
        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
            : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3]));
        }

        {
          __asm__ __volatile__(
            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
            : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])
            : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]));
        }
#endif
      }
    }
  }

  // TODO: Shang: Hoist loop invariance.
  for (int ax1_0_1 = 0; ax1_0_1 < 2; ++ax1_0_1) {
    for (int local_id = 0; local_id < 8; ++local_id) {
      int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8;
      if (row_offset < M)
      {
        *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]);
      }
    }
  }
#endif
}

} // namespace awq
} // namespace vllm

// in_feats: M, IC [float16]
// kernel: IC, OC // 8 [int32] -> cast to IC, OC [uint4b]
// scaling_factors: IC // G, OC [float16]
// zeros: IC // G, OC // 8 [int32] -> cast to IC // G, OC [uint4b]
// assume that batch_size < 16 for now
|
||||||
|
|
||||||
|
torch::Tensor awq_gemm(
|
||||||
|
torch::Tensor _in_feats,
|
||||||
|
torch::Tensor _kernel,
|
||||||
|
torch::Tensor _scaling_factors,
|
||||||
|
torch::Tensor _zeros,
|
||||||
|
int split_k_iters)
|
||||||
|
{
|
||||||
|
int num_in_feats = _in_feats.size(0);
|
||||||
|
int num_in_channels = _in_feats.size(1);
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats));
|
||||||
|
|
||||||
|
auto options = torch::TensorOptions().dtype(_in_feats.dtype()).device(_in_feats.device());
|
||||||
|
at::Tensor _out_feats = torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8}, options);
|
||||||
|
int num_out_feats = _out_feats.size(-2);
|
||||||
|
int num_out_channels = _out_feats.size(-1);
|
||||||
|
|
||||||
|
auto in_feats = reinterpret_cast<half*>(_in_feats.data_ptr<at::Half>());
|
||||||
|
auto kernel = reinterpret_cast<int*>(_kernel.data_ptr<int>());
|
||||||
|
auto out_feats = reinterpret_cast<half*>(_out_feats.data_ptr<at::Half>());
|
||||||
|
auto scaling_factors = reinterpret_cast<half*>(_scaling_factors.data_ptr<at::Half>());
|
||||||
|
auto zeros = reinterpret_cast<int*>(_zeros.data_ptr<int>());
|
||||||
|
int group_size = num_in_channels / _scaling_factors.size(0);
|
||||||
|
|
||||||
|
if (num_out_channels % 64 != 0)
|
||||||
|
throw std::invalid_argument("OC is not multiple of cta_N = 64");
|
||||||
|
if (num_out_channels % 8 != 0)
|
||||||
|
throw std::invalid_argument("OC is not multiple of pack_num = 8");
|
||||||
|
if (group_size % 32 != 0)
|
||||||
|
throw std::invalid_argument("Group size should be a multiple of 32");
|
||||||
|
if (num_out_channels % group_size != 0)
|
||||||
|
throw std::invalid_argument("OC is not multiple of Group size");
|
||||||
|
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
if (num_out_channels % 128 == 0)
|
||||||
|
{
|
||||||
|
int j_factors1 = num_out_channels / 128 / 1;
|
||||||
|
dim3 num_blocks((num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters);
|
||||||
|
// threadIdx.x: 32
|
||||||
|
// threadIdx.y: i_factors[2] * j_factors[2]
|
||||||
|
dim3 threads_per_block(32, 2);
|
||||||
|
vllm::awq::gemm_forward_4bit_cuda_m16n128k32<<<num_blocks, threads_per_block, 0, stream>>>(
|
||||||
|
group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, num_out_channels, out_feats);
|
||||||
|
}
|
||||||
|
else if (num_out_channels % 64 == 0)
|
||||||
|
{
|
||||||
|
int j_factors1 = num_out_channels / 64 / 1;
|
||||||
|
dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters);
|
||||||
|
|
||||||
|
// threadIdx.x: 32
|
||||||
|
// threadIdx.y: i_factors[2] * j_factors[2]
|
||||||
|
dim3 threads_per_block(32, 2);
|
||||||
|
vllm::awq::gemm_forward_4bit_cuda_m16n64k32<<<num_blocks, threads_per_block, 0, stream>>>(
|
||||||
|
group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, num_out_channels, out_feats);
|
||||||
|
}
|
||||||
|
return _out_feats.sum(0);
|
||||||
|
}
|
||||||
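For orientation, here is a minimal sketch of how the ``awq_gemm`` entry point above could be exercised from Python, following the tensor-layout comments in the file. The ``quantization_ops`` import path is an assumption (the binding name depends on how the extension is registered), and the shapes are illustrative:

    # Editor's sketch, not part of this diff; binding module name is assumed.
    import torch
    from vllm import quantization_ops  # hypothetical import path

    M, IC, OC, G = 8, 4096, 4096, 128
    split_k_iters = 8

    in_feats = torch.randn(M, IC, dtype=torch.float16, device="cuda")
    # Each int32 packs eight 4-bit weights: shape (IC, OC // 8).
    qweight = torch.randint(0, 2**31 - 1, (IC, OC // 8), dtype=torch.int32, device="cuda")
    scales = torch.randn(IC // G, OC, dtype=torch.float16, device="cuda")
    # Packed 4-bit zero points: shape (IC // G, OC // 8).
    qzeros = torch.randint(0, 2**31 - 1, (IC // G, OC // 8), dtype=torch.int32, device="cuda")

    out = quantization_ops.awq_gemm(in_feats, qweight, scales, qzeros, split_k_iters)
    assert out.shape == (M, OC)  # the split-k partials are summed inside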
148
csrc/quantization/squeezellm/quant_cuda_kernel.cu
Normal file
@@ -0,0 +1,148 @@
#include <torch/all.h>
#include <torch/python.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

// half-tensor
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDATensorMethods.cuh>

#define BLOCKWIDTH 128
#define BLOCKHEIGHT4 16

namespace vllm {
namespace squeezellm {

__device__ inline unsigned int as_unsigned(int i) {
  return *reinterpret_cast<unsigned int*>(&i);
}

// 4-bit matvec kernel (LUT-based)
__global__ void NUQ4MatMulKernel(
    const half2* __restrict__ vec,
    const int* __restrict__ mat,
    half2* __restrict__ mul,
    const __half* __restrict__ lookup_table,
    int height,
    int width,
    int batch,
    int vec_height
) {

  const int blockwidth2 = BLOCKWIDTH / 2;

  int row = BLOCKHEIGHT4 * blockIdx.x;
  int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;

  __shared__ half2 blockvec[blockwidth2];

  __shared__ __half deq2[16][BLOCKWIDTH];
  int off = threadIdx.x;
  int column_offset = col * 16;
  for (int val = 0; val < 16; val += 1) {
    int lut_index = column_offset + val;
    deq2[val][off] = lookup_table[lut_index];
  }

  __half res;
  half2 res2;
  half2 tmp2;

  int i;
  int k;

  unsigned int tmp1;
  unsigned int lut_index1, lut_index2;

  for (int b = 0; b < batch; ++b){
    i = width * row + col;
    res = __int2half_rd(0);
    k = 0;

    __syncthreads();
    if (threadIdx.x < blockwidth2)
      blockvec[threadIdx.x] = vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + threadIdx.x];
    __syncthreads();

    while (k < blockwidth2) {
      tmp1 = as_unsigned(mat[i]);

      res2 = {};
      tmp2 = {};

      lut_index1 = tmp1 & 0xF;
      lut_index2 = (tmp1 >> 4) & 0xF;
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
      res2 = __hfma2(tmp2, blockvec[k + 0], res2);

      lut_index1 = (tmp1 >> 8) & 0xF;
      lut_index2 = (tmp1 >> 12) & 0xF;
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
      res2 = __hfma2(tmp2, blockvec[k + 1], res2);

      lut_index1 = (tmp1 >> 16) & 0xF;
      lut_index2 = (tmp1 >> 20) & 0xF;
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
      res2 = __hfma2(tmp2, blockvec[k + 2], res2);

      lut_index1 = (tmp1 >> 24) & 0xF;
      lut_index2 = (tmp1 >> 28) & 0xF;
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
      res2 = __hfma2(tmp2, blockvec[k + 3], res2);

      res = __hadd(__hadd(res2.x, res2.y), res);

      i += width;
      k += 4;
    }

    // col%2 -> only set one of the two values
    half2 res3 = {};
    if (col % 2 == 0) {
      res3.x = res;
    } else {
      res3.y = res;
    }

    atomicAdd(&mul[b * width / 2 + col / 2], res3);
  }
}

} // namespace squeezellm
} // namespace vllm

// 4-bit matvec kernel (LUT-based)
void squeezellm_gemm(
  torch::Tensor vec,
  torch::Tensor mat,
  torch::Tensor mul,
  torch::Tensor lookup_table
) {
  int height = mat.size(0);
  int width = mat.size(1);

  int batch = vec.size(0);
  int vec_height = vec.size(1);

  dim3 blocks(
    (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
  );
  dim3 threads(BLOCKWIDTH);

  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads>>>(
    (half2*) vec.data<at::Half>(),
    mat.data_ptr<int>(),
    (half2*) mul.data<at::Half>(),
    (__half*) lookup_table.data<at::Half>(),
    height, width, batch, vec_height
  );
}

#undef BLOCKWIDTH
#undef BLOCKHEIGHT4
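To make the LUT-based unpacking concrete, the following host-side sketch mirrors what ``NUQ4MatMulKernel`` computes for a single batch element. This is an editor's illustration, not part of the commit; the shapes are inferred from the kernel and should be treated as assumptions:

    import torch

    def nuq4_matvec_reference(vec, mat, lookup_table):
        # vec: (vec_height,) float activations.
        # mat: (height, width) int32; each int32 packs eight 4-bit LUT
        #      indices along the input dimension (like tmp1 in the kernel).
        # lookup_table: (width, 16) float, one 16-entry codebook per column.
        height, width = mat.shape
        out = torch.zeros(width, dtype=torch.float32)
        for col in range(width):
            acc = 0.0
            for row in range(height):
                word = int(mat[row, col]) & 0xFFFFFFFF  # as_unsigned()
                for nib in range(8):                     # 8 weights per int32
                    idx = (word >> (4 * nib)) & 0xF      # 4-bit LUT index
                    acc += float(lookup_table[col, idx]) * float(vec[row * 8 + nib])
            out[col] = acc
        return out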
(Eight image files deleted in this diff; previous sizes: 267, 285, 259, 276, 244, 260, 255, and 272 KiB.)
@@ -3,30 +3,14 @@
 Installation
 ============
 
-vLLM is a Python library that also contains some C++ and CUDA code.
-This additional code requires compilation on the user's machine.
+vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
 
 Requirements
 ------------
 
 * OS: Linux
-* Python: 3.8 or higher
-* CUDA: 11.0 -- 11.8
-* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, etc.)
-
-.. note::
-    As of now, vLLM does not support CUDA 12.
-    If you are using Hopper or Lovelace GPUs, please use CUDA 11.8 instead of CUDA 12.
-
-.. tip::
-    If you have trouble installing vLLM, we recommend using the NVIDIA PyTorch Docker image.
-
-    .. code-block:: console
-
-        $ # Pull the Docker image with CUDA 11.8.
-        $ docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/pytorch:22.12-py3
-
-    Inside the Docker container, please execute :code:`pip uninstall torch` before installing vLLM.
+* Python: 3.8 -- 3.11
+* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 
 Install with pip
 ----------------
@@ -39,8 +23,23 @@ You can install vLLM using pip:
 $ conda create -n myenv python=3.8 -y
 $ conda activate myenv
 
-$ # Install vLLM.
-$ pip install vllm # This may take 5-10 minutes.
+$ # Install vLLM with CUDA 12.1.
+$ pip install vllm
+
+.. note::
+
+    As of now, vLLM's binaries are compiled on CUDA 12.1 by default.
+    However, you can install vLLM with CUDA 11.8 by running:
+
+    .. code-block:: console
+
+        $ # Install vLLM with CUDA 11.8.
+        $ # Replace `cp310` with your Python version (e.g., `cp38`, `cp39`, `cp311`).
+        $ pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp310-cp310-manylinux1_x86_64.whl
+
+        $ # Re-install PyTorch with CUDA 11.8.
+        $ pip uninstall torch -y
+        $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118
 
 .. _build_from_source:
@@ -55,3 +54,11 @@ You can also build and install vLLM from source:
 $ git clone https://github.com/vllm-project/vllm.git
 $ cd vllm
 $ pip install -e .  # This may take 5-10 minutes.
+
+.. tip::
+    If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
+
+    .. code-block:: console
+
+        $ # Use `--ipc=host` to make sure the shared memory is large enough.
+        $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
@@ -40,6 +40,16 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O
 
 llm = LLM(model="facebook/opt-125m")
 
+Use model from www.modelscope.cn
+
+.. code-block:: shell
+
+    export VLLM_USE_MODELSCOPE=True
+
+.. code-block:: python
+
+    llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)
+
 Call ``llm.generate`` to generate the outputs. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
 
 .. code-block:: python
@@ -67,6 +77,16 @@ Start the server:
 
 $ python -m vllm.entrypoints.api_server
 
+Use model from www.modelscope.cn
+
+.. code-block:: console
+
+    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server \
+    $     --model="qwen/Qwen-7B-Chat" \
+    $     --revision="v1.1.8" \
+    $     --trust-remote-code
+
 By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.
 
 Query the model in shell:
@@ -87,6 +107,7 @@ OpenAI-Compatible Server
 ------------------------
 
 vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
+By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_, `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_, and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
 
 Start the server:
@@ -95,7 +116,20 @@ Start the server:
 $ python -m vllm.entrypoints.openai.api_server \
 $     --model facebook/opt-125m
 
-By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
+Use model from www.modelscope.cn
+
+.. code-block:: console
+
+    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
+    $     --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code
+
+By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:
+
+.. code-block:: console
+
+    $ python -m vllm.entrypoints.openai.api_server \
+    $     --model facebook/opt-125m \
+    $     --chat-template ./examples/template_chatml.json
+
 This server can be queried in the same format as OpenAI API. For example, list the models:
@@ -103,6 +137,9 @@ This server can be queried in the same format as OpenAI API. For example, list t
 
 $ curl http://localhost:8000/v1/models
 
+Using OpenAI Completions API with vLLM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 Query the model with input prompts:
 
 .. code-block:: console
@@ -120,12 +157,65 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
 
 .. code-block:: python
 
-    import openai
+    from openai import OpenAI
 
     # Modify OpenAI's API key and API base to use vLLM's API server.
-    openai.api_key = "EMPTY"
-    openai.api_base = "http://localhost:8000/v1"
-    completion = openai.Completion.create(model="facebook/opt-125m",
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    completion = client.completions.create(model="facebook/opt-125m",
                                            prompt="San Francisco is a")
     print("Completion result:", completion)
 
-For a more detailed client example, refer to `examples/openai_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_client.py>`_.
+For a more detailed client example, refer to `examples/openai_completion_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py>`_.
+
+Using OpenAI Chat API with vLLM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+
+Querying the model using OpenAI Chat API:
+
+You can use the `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_ endpoint to communicate with the model in a chat-like interface:
+
+.. code-block:: console
+
+    $ curl http://localhost:8000/v1/chat/completions \
+    $     -H "Content-Type: application/json" \
+    $     -d '{
+    $         "model": "facebook/opt-125m",
+    $         "messages": [
+    $             {"role": "system", "content": "You are a helpful assistant."},
+    $             {"role": "user", "content": "Who won the world series in 2020?"}
+    $         ]
+    $     }'
+
+Python Client Example:
+
+Using the `openai` python package, you can also communicate with the model in a chat-like manner:
+
+.. code-block:: python
+
+    from openai import OpenAI
+    # Set OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    chat_response = client.chat.completions.create(
+        model="facebook/opt-125m",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Tell me a joke."},
+        ]
+    )
+    print("Chat response:", chat_response)
+
+For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation.
@@ -43,6 +43,7 @@ vLLM is flexible and easy to use with:
 For more information, check out the following:
 
 * `vLLM announcing blog post <https://vllm.ai>`_ (intro to PagedAttention)
+* `vLLM paper <https://arxiv.org/abs/2309.06180>`_ (SOSP 2023)
 * `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency <https://www.anyscale.com/blog/continuous-batching-llm-inference>`_ by Cade Daniel et al.
 
 
@@ -63,6 +64,10 @@ Documentation
 
    serving/distributed_serving
    serving/run_on_sky
+   serving/deploying_with_triton
+   serving/deploying_with_docker
+   serving/serving_with_langchain
+   serving/metrics
 
 .. toctree::
    :maxdepth: 1
@@ -70,3 +75,10 @@ Documentation
 
    models/supported_models
    models/adding_model
+   models/engine_args
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Quantization
+
+   quantization/auto_awq
@@ -18,7 +18,7 @@ This document provides a high-level guide on integrating a `HuggingFace Transfor
 0. Fork the vLLM repository
 --------------------------------
 
-Start by forking our `GitHub <https://github.com/vllm-project/vllm/>`_ repository and then :ref:`build it from source <build_from_source>`.
+Start by forking our `GitHub`_ repository and then :ref:`build it from source <build_from_source>`.
 This gives you the ability to modify the codebase and test your model.
 
 
@@ -62,31 +62,34 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
 +) -> SamplerOutput:
 
 3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
-4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture.
+4. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
 
 .. note::
     Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
     If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
 
 
-3. (Optional) Implement tensor parallelism support
---------------------------------------------------
+3. (Optional) Implement tensor parallelism and quantization support
+-------------------------------------------------------------------
 
 If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
 To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`.
-When it comes to the linear layers, you should use either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`.
-Typically, :code:`ColumnParallelLinear` is used for QKV linear layers and the first linear layers of the MLP blocks.
-For the remaining linear layers, :code:`RowParallelLinear` is used.
+For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
+When it comes to the linear layers, we provide the following options to parallelize them:
+
+* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
+* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
+* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
+* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
+* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
+
+Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
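To make the row/column split concrete, here is an illustrative sketch with plain tensors (an editor's addition; the shard count and shapes are arbitrary assumptions, and the real layers shard across GPUs and communicate via all-reduce rather than summing locally):

.. code-block:: python

    import torch

    tp_size, d_in, d_out = 2, 8, 8
    x = torch.randn(1, d_in)
    w = torch.randn(d_in, d_out)

    # Column parallel: shard W along the output dim; each rank computes
    # x @ W_shard and the outputs stay sharded (no immediate communication).
    col_out = [x @ shard for shard in w.chunk(tp_size, dim=1)]

    # Row parallel: shard W along the input dim (and x to match); the partial
    # products must be summed, which is the all-reduce in the real layer.
    row_out = sum(xs @ ws
                  for xs, ws in zip(x.chunk(tp_size, dim=1),
                                    w.chunk(tp_size, dim=0)))

    # Both schemes reproduce the unsharded matmul.
    assert torch.allclose(torch.cat(col_out, dim=1), x @ w, atol=1e-5)
    assert torch.allclose(row_out, x @ w, atol=1e-5)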
 
 4. Implement the weight loading logic
 -------------------------------------
 
 You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class.
-This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model.
-While the process is straightforward for most layers, the tensor-parallel layers necessitate some additional care as their weights should be partitioned to multiple GPUs.
+This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
 
 
 5. Register your model
 ----------------------
114
docs/source/models/engine_args.rst
Normal file
@@ -0,0 +1,114 @@
.. _engine_args:

Engine Arguments
================

Below, you can find an explanation of every engine argument for vLLM:

.. option:: --model <model_name_or_path>

    Name or path of the huggingface model to use.

.. option:: --tokenizer <tokenizer_name_or_path>

    Name or path of the huggingface tokenizer to use.

.. option:: --revision <revision>

    The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

.. option:: --tokenizer-revision <revision>

    The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

.. option:: --tokenizer-mode {auto,slow}

    The tokenizer mode.

    * "auto" will use the fast tokenizer if available.
    * "slow" will always use the slow tokenizer.

.. option:: --trust-remote-code

    Trust remote code from huggingface.

.. option:: --download-dir <directory>

    Directory to download and load the weights, default to the default cache dir of huggingface.

.. option:: --load-format {auto,pt,safetensors,npcache,dummy}

    The format of the model weights to load.

    * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.
    * "pt" will load the weights in the pytorch bin format.
    * "safetensors" will load the weights in the safetensors format.
    * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
    * "dummy" will initialize the weights with random values, mainly for profiling.

.. option:: --dtype {auto,half,float16,bfloat16,float,float32}

    Data type for model weights and activations.

    * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
    * "half" for FP16. Recommended for AWQ quantization.
    * "float16" is the same as "half".
    * "bfloat16" for a balance between precision and range.
    * "float" is shorthand for FP32 precision.
    * "float32" for FP32 precision.

.. option:: --max-model-len <length>

    Model context length. If unspecified, will be automatically derived from the model config.

.. option:: --worker-use-ray

    Use Ray for distributed serving, will be automatically set when using more than 1 GPU.

.. option:: --pipeline-parallel-size (-pp) <size>

    Number of pipeline stages.

.. option:: --tensor-parallel-size (-tp) <size>

    Number of tensor parallel replicas.

.. option:: --max-parallel-loading-workers <workers>

    Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models.

.. option:: --block-size {8,16,32}

    Token block size for contiguous chunks of tokens.

.. option:: --seed <seed>

    Random seed for operations.

.. option:: --swap-space <size>

    CPU swap space size (GiB) per GPU.

.. option:: --gpu-memory-utilization <percentage>

    The percentage of GPU memory to be used for the model executor.

.. option:: --max-num-batched-tokens <tokens>

    Maximum number of batched tokens per iteration.

.. option:: --max-num-seqs <sequences>

    Maximum number of sequences per iteration.

.. option:: --max-paddings <paddings>

    Maximum number of paddings in a batch.

.. option:: --disable-log-stats

    Disable logging statistics.

.. option:: --quantization (-q) {awq,squeezellm,None}

    Method used to quantize the weights.
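As an editor-added illustration, these flags correspond to ``EngineArgs``/``LLM`` keyword arguments when using the library directly; the values below are arbitrary assumptions:

.. code-block:: python

    from vllm import LLM

    # Equivalent to: --dtype half --max-model-len 2048
    #                --gpu-memory-utilization 0.9 --seed 0
    llm = LLM(
        model="facebook/opt-125m",
        dtype="half",
        max_model_len=2048,
        gpu_memory_utilization=0.9,
        seed=0,
    )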
@@ -19,13 +19,16 @@ Alongside each architecture, we include some popular models that use it.
     - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
   * - :code:`BaiChuanForCausalLM`
     - Baichuan
-    - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc.
+    - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc.
+  * - :code:`ChatGLMModel`
+    - ChatGLM
+    - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
   * - :code:`BloomForCausalLM`
     - BLOOM, BLOOMZ, BLOOMChat
     - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
   * - :code:`FalconForCausalLM`
     - Falcon
-    - :code:`tiiuae/falcon-7b``, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
+    - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
   * - :code:`GPT2LMHeadModel`
     - GPT-2
     - :code:`gpt2`, :code:`gpt2-xl`, etc.
@@ -44,15 +47,24 @@ Alongside each architecture, we include some popular models that use it.
   * - :code:`LlamaForCausalLM`
     - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco
     - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc.
+  * - :code:`MistralForCausalLM`
+    - Mistral, Mistral-Instruct
+    - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc.
   * - :code:`MPTForCausalLM`
     - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
     - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
   * - :code:`OPTForCausalLM`
     - OPT, OPT-IML
     - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
+  * - :code:`PhiForCausalLM`
+    - Phi-1.5
+    - :code:`microsoft/phi-1_5`, etc.
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
+  * - :code:`YiForCausalLM`
+    - Yi
+    - :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
 
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
 Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
@@ -69,4 +81,18 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
     output = llm.generate("Hello, my name is")
     print(output)
 
+To use model from www.modelscope.cn
+
+.. code-block:: shell
+
+    $ export VLLM_USE_MODELSCOPE=True
+
+.. code-block:: python
+
+    from vllm import LLM
+
+    llm = LLM(model=..., revision=..., trust_remote_code=True)  # Name or path of your model
+    output = llm.generate("Hello, my name is")
+    print(output)
+
 If vLLM successfully generates text, it indicates that your model is supported.
75
docs/source/quantization/auto_awq.rst
Normal file
@@ -0,0 +1,75 @@
.. _auto_awq:

AutoAWQ
==================

.. warning::

    Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
    accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low-latency
    inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version.

To create a new 4-bit quantized model, you can leverage `AutoAWQ <https://github.com/casper-hansen/AutoAWQ>`_.
Quantizing reduces the model's precision from FP16 to INT4, which effectively reduces the file size by ~70%.
The main benefits are lower latency and memory usage.

You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface <https://huggingface.co/models?sort=trending&search=awq>`_.

.. code-block:: console

    $ pip install autoawq

After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize Vicuna 7B v1.5:

.. code-block:: python

    from awq import AutoAWQForCausalLM
    from transformers import AutoTokenizer

    model_path = 'lmsys/vicuna-7b-v1.5'
    quant_path = 'vicuna-7b-v1.5-awq'
    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

    # Load model
    model = AutoAWQForCausalLM.from_pretrained(model_path, **{"low_cpu_mem_usage": True})
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Quantize
    model.quantize(tokenizer, quant_config=quant_config)

    # Save quantized model
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)

To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ <https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ>`_ with the following command:

.. code-block:: console

    $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq

AWQ models are also supported directly through the LLM entrypoint:

.. code-block:: python

    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
43
docs/source/serving/deploying_with_docker.rst
Normal file
@@ -0,0 +1,43 @@
.. _deploying_with_docker:

Deploying with Docker
============================

vLLM offers an official Docker image for deployment.
The image can be used to run an OpenAI-compatible server.
The image is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.

.. code-block:: console

    $ docker run --runtime nvidia --gpus all \
        -v ~/.cache/huggingface:/root/.cache/huggingface \
        --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
        -p 8000:8000 \
        --ipc=host \
        vllm/vllm-openai:latest \
        --model mistralai/Mistral-7B-v0.1


.. note::

    You can use either the ``--ipc=host`` flag or the ``--shm-size`` flag to allow the
    container to access the host's shared memory. vLLM uses PyTorch, which uses shared
    memory to share data between processes under the hood, particularly for tensor parallel inference.


You can build and run vLLM from source via the provided dockerfile. To build vLLM:

.. code-block:: console

    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --build-arg max_jobs=8

To run vLLM:

.. code-block:: console

    $ docker run --runtime nvidia --gpus all \
        -v ~/.cache/huggingface:/root/.cache/huggingface \
        -p 8000:8000 \
        --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
        vllm/vllm-openai <args...>
6
docs/source/serving/deploying_with_triton.rst
Normal file
@@ -0,0 +1,6 @@
.. _deploying_with_triton:

Deploying with NVIDIA Triton
============================

The `Triton Inference Server <https://github.com/triton-inference-server>`_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m <https://huggingface.co/facebook/opt-125m>`_ model using vLLM. Please see `Deploying a vLLM model in Triton <https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton>`_ for more details.
13
docs/source/serving/metrics.rst
Normal file
@@ -0,0 +1,13 @@
Production Metrics
==================

vLLM exposes a number of metrics that can be used to monitor the health of the
system. These metrics are exposed via the `/metrics` endpoint on the vLLM
OpenAI compatible API server.

The following metrics are exposed:

.. literalinclude:: ../../../vllm/engine/metrics.py
    :language: python
    :start-after: begin-metrics-definitions
    :end-before: end-metrics-definitions
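For example (an editor-added sketch; the host and port assume a locally running server), the endpoint can be polled with plain HTTP and parsed like any Prometheus-format export:

.. code-block:: python

    import urllib.request

    # Fetch the Prometheus-format metrics from a locally running server.
    with urllib.request.urlopen("http://localhost:8000/metrics") as resp:
        for line in resp.read().decode().splitlines():
            if not line.startswith("#"):  # skip HELP/TYPE comment lines
                print(line)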
31
docs/source/serving/serving_with_langchain.rst
Normal file
@@ -0,0 +1,31 @@
.. _run_on_langchain:

Serving with Langchain
============================

vLLM is also available via `Langchain <https://github.com/langchain-ai/langchain>`_.

To install langchain, run

.. code-block:: console

    $ pip install langchain -q

To run inference on a single or multiple GPUs, use the ``VLLM`` class from ``langchain``.

.. code-block:: python

    from langchain.llms import VLLM

    llm = VLLM(model="mosaicml/mpt-7b",
               trust_remote_code=True,  # mandatory for hf models
               max_new_tokens=128,
               top_k=10,
               top_p=0.95,
               temperature=0.8,
               # tensor_parallel_size=... # for distributed inference
    )

    print(llm("What is the capital of France ?"))

Please refer to this `Tutorial <https://github.com/langchain-ai/langchain/blob/master/docs/extras/integrations/llms/vllm.ipynb>`_ for more details.
@@ -39,7 +39,7 @@ def build_demo():
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--host", type=str, default=None)
     parser.add_argument("--port", type=int, default=8001)
     parser.add_argument("--model-url",
                         type=str,
@@ -1,17 +1,14 @@
 import argparse
+from typing import List, Tuple
 
-from vllm import EngineArgs, LLMEngine, SamplingParams
+from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
 
 
-def main(args: argparse.Namespace):
-    # Parse the CLI argument and initialize the engine.
-    engine_args = EngineArgs.from_cli_args(args)
-    engine = LLMEngine.from_engine_args(engine_args)
-
-    # Test the following prompts.
-    test_prompts = [
+def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
+    """Create a list of test prompts with their sampling parameters."""
+    return [
         ("A robot may not injure a human being",
-         SamplingParams(temperature=0.0)),
+         SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
         ("To be or not to be,",
          SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
         ("What is the meaning of life?",
@@ -25,22 +22,36 @@ def main(args: argparse.Namespace):
          temperature=0.0)),
     ]
 
-    # Run the engine by calling `engine.step()` manually.
+
+def process_requests(engine: LLMEngine,
+                     test_prompts: List[Tuple[str, SamplingParams]]):
+    """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
-    while True:
-        # To test continuous batching, we add one request at each step.
+
+    while test_prompts or engine.has_unfinished_requests():
         if test_prompts:
             prompt, sampling_params = test_prompts.pop(0)
             engine.add_request(str(request_id), prompt, sampling_params)
             request_id += 1
 
-        request_outputs = engine.step()
+        request_outputs: List[RequestOutput] = engine.step()
+
         for request_output in request_outputs:
             if request_output.finished:
                 print(request_output)
 
-        if not (engine.has_unfinished_requests() or test_prompts):
-            break
+
+def initialize_engine(args: argparse.Namespace) -> LLMEngine:
+    """Initialize the LLMEngine from the command line arguments."""
+    engine_args = EngineArgs.from_cli_args(args)
+    return LLMEngine.from_engine_args(engine_args)
+
+
+def main(args: argparse.Namespace):
+    """Main function that sets up and runs the prompt processing."""
+    engine = initialize_engine(args)
+    test_prompts = create_test_prompts()
+    process_requests(engine, test_prompts)
 
 
 if __name__ == '__main__':
@@ -1,18 +1,19 @@
-import openai
+from openai import OpenAI
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
-openai.api_key = "EMPTY"
-openai.api_base = "http://localhost:8000/v1"
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
 
-# List models API
-models = openai.Model.list()
-print("Models:", models)
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
 
-model = models["data"][0]["id"]
+models = client.models.list()
+model = models.data[0].id
 
-# Chat completion API
-chat_completion = openai.ChatCompletion.create(
-    model=model,
+chat_completion = client.chat.completions.create(
     messages=[{
         "role": "system",
         "content": "You are a helpful assistant."
@@ -27,7 +28,10 @@ chat_completion = openai.ChatCompletion.create(
     }, {
         "role": "user",
         "content": "Where was it played?"
-    }])
+    }],
+    model=model,
+)
+
 
 print("Chat completion results:")
 print(chat_completion)
@@ -1,24 +1,28 @@
-import openai
+from openai import OpenAI
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
-openai.api_key = "EMPTY"
-openai.api_base = "http://localhost:8000/v1"
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
 
-# List models API
-models = openai.Model.list()
-print("Models:", models)
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
 
-model = models["data"][0]["id"]
+models = client.models.list()
+model = models.data[0].id
 
 # Completion API
 stream = False
-completion = openai.Completion.create(
+completion = client.completions.create(
     model=model,
     prompt="A robot may not injure a human being",
     echo=False,
     n=2,
     stream=stream,
-    logprobs=3)
+    logprobs=3
+)
 
 print("Completion results:")
 if stream:
29
examples/template_alpaca.jinja
Normal file
@@ -0,0 +1,29 @@
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}

{% for message in messages %}
{% if message['role'] == 'user' %}
### Instruction:
{{ message['content']|trim -}}
{% if not loop.last %}


{% endif %}
{% elif message['role'] == 'assistant' %}
### Response:
{{ message['content']|trim -}}
{% if not loop.last %}


{% endif %}
{% elif message['role'] == 'user_context' %}
### Input:
{{ message['content']|trim -}}
{% if not loop.last %}


{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
### Response:
{% endif %}
2
examples/template_chatml.jinja
Normal file
@@ -0,0 +1,2 @@
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
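To see what the ChatML template above produces, it can be rendered with Jinja2 directly. This is an editor-added sketch (it assumes the ``jinja2`` package is installed and the repository is the working directory; vLLM itself applies the template server-side when ``--chat-template`` is given):

    from jinja2 import Template

    # Load the template file added in this diff.
    with open("examples/template_chatml.jinja") as f:
        template = Template(f.read())

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me a joke."},
    ]
    # Prints the <|im_start|>...<|im_end|> transcript, ending with the
    # "<|im_start|>assistant" generation prompt.
    print(template.render(messages=messages, add_generation_prompt=True))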
30
examples/template_inkbot.jinja
Normal file
@@ -0,0 +1,30 @@
<#meta#>
- Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
- Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
<#system#>
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
<#chat#>
{% for message in messages %}
{% if message['role'] == 'user' %}
<#user#>
{{ message['content']|trim -}}
{% if not loop.last %}

{% endif %}
{% elif message['role'] == 'assistant' %}
<#bot#>
{{ message['content']|trim -}}
{% if not loop.last %}

{% endif %}
{% elif message['role'] == 'user_context' %}
<#user_context#>
{{ message['content']|trim -}}
{% if not loop.last %}

{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
<#bot#>
{% endif %}
format.sh (49 lines changed)
@@ -7,7 +7,7 @@
 # # Format files that differ from origin/main.
 # bash format.sh

-# # Commit changed files with message 'Run yapf and pylint'
+# # Commit changed files with message 'Run yapf and ruff'
 #
 #
 # YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
@@ -22,7 +22,7 @@ ROOT="$(git rev-parse --show-toplevel)"
 builtin cd "$ROOT" || exit 1

 YAPF_VERSION=$(yapf --version | awk '{print $2}')
-PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}')
+RUFF_VERSION=$(ruff --version | awk '{print $2}')
 MYPY_VERSION=$(mypy --version | awk '{print $2}')

 # # params: tool name, tool version, required version
@@ -34,7 +34,7 @@ tool_version_check() {
 }

 tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
-tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"

 YAPF_FLAGS=(
@@ -44,7 +44,6 @@ YAPF_FLAGS=(

 YAPF_EXCLUDES=(
     '--exclude' 'build/**'
-    '--exclude' 'vllm/model_executor/parallel_utils/**'
 )

 # Format specified files
@@ -72,7 +71,7 @@ format_changed() {

 # Format all files
 format_all() {
-    yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm
+    yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm tests
 }

 ## This flag formats individual files. --files *must* be the first command line
@@ -94,9 +93,43 @@ echo 'vLLM yapf: Done'
 # echo 'vLLM mypy:'
 # mypy

-# Run Pylint
-echo 'vLLM Pylint:'
-pylint vllm
+# Lint specified files
+lint() {
+    ruff "$@"
+}
+
+# Lint files that differ from main branch. Ignores dirs that are not slated
+# for autolint yet.
+lint_changed() {
+    # The `if` guard ensures that the list of filenames is not empty, which
+    # could cause ruff to receive 0 positional arguments, making it hang
+    # waiting for STDIN.
+    #
+    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
+    # exist on both branches.
+    MERGEBASE="$(git merge-base origin/main HEAD)"
+
+    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
+        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
+            ruff
+    fi
+
+}
+
+# Run Ruff
+echo 'vLLM Ruff:'
+## This flag lints individual files. --files *must* be the first command line
+## arg to use this option.
+if [[ "$1" == '--files' ]]; then
+   lint "${@:2}"
+# If `--all` is passed, then any further arguments are ignored and the
+# entire python directory is linted.
+elif [[ "$1" == '--all' ]]; then
+   lint vllm tests
+else
+   # Format only the files that changed in last commit.
+   lint_changed
+fi

 if ! git diff --quiet &>/dev/null; then
     echo 'Reformatted files. Please review and stage the changes.'
pyproject.toml:
@@ -1,9 +1,34 @@
 [build-system]
+# Should be mirrored in requirements-build.txt
 requires = [
     "ninja",
     "packaging",
-    "setuptools",
-    "torch >= 2.0.0",
+    "setuptools >= 49.4.0",
+    "torch >= 2.1.0",
     "wheel",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    # "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    # "I",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # line too long, handled by black formatting
+    "E501",
+]
requirements-build.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
+# Should be mirrored in pyproject.toml
+ninja
+packaging
+setuptools>=49.4.0
+torch>=2.1.0
+wheel
requirements-dev.txt:
@@ -1,6 +1,6 @@
 # formatting
 yapf==0.32.0
-pylint==2.8.2
+ruff==0.1.5

 # type checking
 mypy==0.991
@@ -11,3 +11,5 @@ types-setuptools
 # testing
 pytest
 pytest-forked
+pytest-asyncio
requirements.txt:
@@ -1,11 +1,15 @@
 ninja  # For faster builds.
 psutil
 ray >= 2.5.1
+pandas  # Required for Ray data.
+pyarrow  # Required for Ray data.
 sentencepiece  # Required for LLaMA tokenizer.
 numpy
-torch >= 2.0.0
-transformers >= 4.33.1  # Required for Code Llama.
-xformers >= 0.0.21
+einops  # Required for phi-1_5
+torch >= 2.1.0
+transformers >= 4.34.0  # Required for Mistral.
+xformers >= 0.0.22.post7  # Required for CUDA 12.1.
 fastapi
-uvicorn
-pydantic < 2  # Required for OpenAI server.
+uvicorn[standard]
+pydantic == 1.10.13  # Required for OpenAI server.
+aioprometheus[starlette]
setup.py (213 lines changed)
@@ -3,6 +3,7 @@ import os
 import re
 import subprocess
 from typing import List, Set
+import warnings

 from packaging.version import parse, Version
 import setuptools
@@ -11,6 +12,11 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

 ROOT_DIR = os.path.dirname(__file__)

+MAIN_CUDA_VERSION = "12.1"
+
+# Supported NVIDIA GPU architectures.
+SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+
 # Compiler flags.
 CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
 # TODO(woosuk): Should we use -O3?
@@ -22,7 +28,7 @@ NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]

 if CUDA_HOME is None:
     raise RuntimeError(
-        f"Cannot find CUDA_HOME. CUDA must be available to build the package.")
+        "Cannot find CUDA_HOME. CUDA must be available to build the package.")


 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
@@ -38,47 +44,97 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version:
     return nvcc_cuda_version


-# Collect the compute capabilities of all available GPUs.
-device_count = torch.cuda.device_count()
-compute_capabilities: Set[int] = set()
-for i in range(device_count):
-    major, minor = torch.cuda.get_device_capability(i)
-    if major < 7:
-        raise RuntimeError(
-            "GPUs with compute capability less than 7.0 are not supported.")
-    compute_capabilities.add(major * 10 + minor)
+def get_torch_arch_list() -> Set[str]:
+    # TORCH_CUDA_ARCH_LIST can have one or more architectures,
+    # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
+    # compiler to additionally include PTX code that can be runtime-compiled
+    # and executed on the 8.6 or newer architectures. While the PTX code will
+    # not give the best performance on the newer architectures, it provides
+    # forward compatibility.
+    env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+    if env_arch_list is None:
+        return set()
+
+    # List are separated by ; or space.
+    torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
+    if not torch_arch_list:
+        return set()
+
+    # Filter out the invalid architectures and print a warning.
+    valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in SUPPORTED_ARCHS})
+    arch_list = torch_arch_list.intersection(valid_archs)
+    # If none of the specified architectures are valid, raise an error.
+    if not arch_list:
+        raise RuntimeError(
+            "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
+            f"variable ({env_arch_list}) is supported. "
+            f"Supported CUDA architectures are: {valid_archs}.")
+    invalid_arch_list = torch_arch_list - valid_archs
+    if invalid_arch_list:
+        warnings.warn(
+            f"Unsupported CUDA architectures ({invalid_arch_list}) are "
+            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
+            f"({env_arch_list}). Supported CUDA architectures are: "
+            f"{valid_archs}.",
+            stacklevel=2)
+    return arch_list
+
+
+# First, check the TORCH_CUDA_ARCH_LIST environment variable.
+compute_capabilities = get_torch_arch_list()
+if not compute_capabilities:
+    # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
+    # GPUs on the current machine.
+    device_count = torch.cuda.device_count()
+    for i in range(device_count):
+        major, minor = torch.cuda.get_device_capability(i)
+        if major < 7:
+            raise RuntimeError(
+                "GPUs with compute capability below 7.0 are not supported.")
+        compute_capabilities.add(f"{major}.{minor}")
+
+nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
+if not compute_capabilities:
+    # If no GPU is specified nor available, add all supported architectures
+    # based on the NVCC CUDA version.
+    compute_capabilities = SUPPORTED_ARCHS.copy()
+    if nvcc_cuda_version < Version("11.1"):
+        compute_capabilities.remove("8.6")
+    if nvcc_cuda_version < Version("11.8"):
+        compute_capabilities.remove("8.9")
+        compute_capabilities.remove("9.0")

 # Validate the NVCC CUDA version.
-nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
 if nvcc_cuda_version < Version("11.0"):
     raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
-if 86 in compute_capabilities and nvcc_cuda_version < Version("11.1"):
-    raise RuntimeError(
-        "CUDA 11.1 or higher is required for GPUs with compute capability 8.6.")
-if 89 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
-    # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
-    # However, GPUs with compute capability 8.9 can also run the code generated by
-    # the previous versions of CUDA 11 and targeting compute capability 8.0.
-    # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
-    # instead of 8.9.
-    compute_capabilities.remove(89)
-    compute_capabilities.add(80)
-if 90 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
-    raise RuntimeError(
-        "CUDA 11.8 or higher is required for GPUs with compute capability 9.0.")
-
-# If no GPU is available, add all supported compute capabilities.
-if not compute_capabilities:
-    compute_capabilities = {70, 75, 80}
-    if nvcc_cuda_version >= Version("11.1"):
-        compute_capabilities.add(86)
-    if nvcc_cuda_version >= Version("11.8"):
-        compute_capabilities.add(89)
-        compute_capabilities.add(90)
+if (nvcc_cuda_version < Version("11.1")
+        and any(cc.startswith("8.6") for cc in compute_capabilities)):
+    raise RuntimeError(
+        "CUDA 11.1 or higher is required for compute capability 8.6.")
+if nvcc_cuda_version < Version("11.8"):
+    if any(cc.startswith("8.9") for cc in compute_capabilities):
+        # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
+        # However, GPUs with compute capability 8.9 can also run the code generated by
+        # the previous versions of CUDA 11 and targeting compute capability 8.0.
+        # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
+        # instead of 8.9.
+        warnings.warn(
+            "CUDA 11.8 or higher is required for compute capability 8.9. "
+            "Targeting compute capability 8.0 instead.",
+            stacklevel=2)
+        compute_capabilities = set(cc for cc in compute_capabilities
+                                   if not cc.startswith("8.9"))
+        compute_capabilities.add("8.0+PTX")
+    if any(cc.startswith("9.0") for cc in compute_capabilities):
+        raise RuntimeError(
+            "CUDA 11.8 or higher is required for compute capability 9.0.")

 # Add target compute capabilities to NVCC flags.
 for capability in compute_capabilities:
-    NVCC_FLAGS += ["-gencode", f"arch=compute_{capability},code=sm_{capability}"]
+    num = capability[0] + capability[2]
+    NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
+    if capability.endswith("+PTX"):
+        NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"]

 # Use NVCC threads to parallelize the build.
 if nvcc_cuda_version >= Version("11.2"):
@@ -86,68 +142,60 @@ if nvcc_cuda_version >= Version("11.2"):
     NVCC_FLAGS += ["--threads", str(num_threads)]

 ext_modules = []

-# Cache operations.
-cache_extension = CUDAExtension(
-    name="vllm.cache_ops",
-    sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
-    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
-)
-ext_modules.append(cache_extension)
-
-# Attention kernels.
-attention_extension = CUDAExtension(
-    name="vllm.attention_ops",
-    sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
-    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
-)
-ext_modules.append(attention_extension)
-
-# Positional encoding kernels.
-positional_encoding_extension = CUDAExtension(
-    name="vllm.pos_encoding_ops",
-    sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
-    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
-)
-ext_modules.append(positional_encoding_extension)
-
-# Layer normalization kernels.
-layernorm_extension = CUDAExtension(
-    name="vllm.layernorm_ops",
-    sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
-    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
-)
-ext_modules.append(layernorm_extension)
-
-# Activation kernels.
-activation_extension = CUDAExtension(
-    name="vllm.activation_ops",
-    sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
-    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
-)
-ext_modules.append(activation_extension)
+vllm_extension = CUDAExtension(
+    name="vllm._C",
+    sources=[
+        "csrc/cache_kernels.cu",
+        "csrc/attention/attention_kernels.cu",
+        "csrc/pos_encoding_kernels.cu",
+        "csrc/activation_kernels.cu",
+        "csrc/layernorm_kernels.cu",
+        "csrc/quantization/awq/gemm_kernels.cu",
+        "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
+        "csrc/cuda_utils_kernels.cu",
+        "csrc/pybind.cpp",
+    ],
+    extra_compile_args={
+        "cxx": CXX_FLAGS,
+        "nvcc": NVCC_FLAGS,
+    },
+)
+ext_modules.append(vllm_extension)


 def get_path(*filepath) -> str:
     return os.path.join(ROOT_DIR, *filepath)


-def find_version(filepath: str):
+def find_version(filepath: str) -> str:
     """Extract version information from the given filepath.

     Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
     """
     with open(filepath) as fp:
-        version_match = re.search(
-            r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M)
+        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
+                                  fp.read(), re.M)
         if version_match:
             return version_match.group(1)
         raise RuntimeError("Unable to find version string.")


+def get_vllm_version() -> str:
+    version = find_version(get_path("vllm", "__init__.py"))
+    cuda_version = str(nvcc_cuda_version)
+    if cuda_version != MAIN_CUDA_VERSION:
+        cuda_version_str = cuda_version.replace(".", "")[:3]
+        version += f"+cu{cuda_version_str}"
+    return version
+
+
 def read_readme() -> str:
-    """Read the README file."""
-    return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+    """Read the README file if present."""
+    p = get_path("README.md")
+    if os.path.isfile(p):
+        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+    else:
+        return ""


 def get_requirements() -> List[str]:
@@ -159,10 +207,11 @@ def get_requirements() -> List[str]:

 setuptools.setup(
     name="vllm",
-    version=find_version(get_path("vllm", "__init__.py")),
+    version=get_vllm_version(),
     author="vLLM Team",
     license="Apache 2.0",
-    description="A high-throughput and memory-efficient inference and serving engine for LLMs",
+    description=("A high-throughput and memory-efficient inference and "
+                 "serving engine for LLMs"),
     long_description=read_readme(),
     long_description_content_type="text/markdown",
     url="https://github.com/vllm-project/vllm",
@@ -174,13 +223,15 @@ setuptools.setup(
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "License :: OSI Approved :: Apache Software License",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
-    packages=setuptools.find_packages(
-        exclude=("assets", "benchmarks", "csrc", "docs", "examples", "tests")),
+    packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
+                                               "examples", "tests")),
     python_requires=">=3.8",
     install_requires=get_requirements(),
     ext_modules=ext_modules,
     cmdclass={"build_ext": BuildExtension},
+    package_data={"vllm": ["py.typed"]},
 )
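The new gencode logic in setup.py above keys compute capabilities by strings such as "8.6" or "8.6+PTX" rather than integers. A worked illustration of the flag generation (the arch set here is hypothetical):

# "8.6+PTX" contributes both a SASS target (code=sm_86) and a PTX target
# (code=compute_86) for forward compatibility on newer GPUs.
compute_capabilities = {"7.5", "8.6+PTX"}
NVCC_FLAGS = []
for capability in compute_capabilities:
    num = capability[0] + capability[2]  # "8.6+PTX" -> "86", "7.5" -> "75"
    NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
    if capability.endswith("+PTX"):
        NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"]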
tests/__init__.py (new empty file)

@@ -40,8 +40,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args,
-                                                      start_engine_loop=False)
+    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
     vllm.entrypoints.api_server.engine = engine
     uvicorn.run(
         app,
@@ -48,9 +48,9 @@ def test_api_server(api_server):
     result = None
     while not result:
         try:
-            for result in pool.map(_query_server, prompts):
+            for _ in pool.map(_query_server, prompts):
                 break
-        except:
+        except Exception:
             time.sleep(1)

     # Actual tests start here
tests/async_engine/test_async_llm_engine.py (new file, 80 lines)
@@ -0,0 +1,80 @@
+import asyncio
+from dataclasses import dataclass
+
+import pytest
+
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+
+@dataclass
+class RequestOutput:
+    request_id: int
+    finished: bool = False
+
+
+class MockEngine:
+
+    def __init__(self):
+        self.step_calls = 0
+        self.add_request_calls = 0
+        self.abort_request_calls = 0
+        self.request_id = None
+
+    async def step_async(self):
+        self.step_calls += 1
+        return [RequestOutput(
+            request_id=self.request_id)] if self.request_id else []
+
+    def generate(self, request_id):
+        self.request_id = request_id
+
+    def stop_generating(self):
+        self.request_id = None
+
+    def add_request(self, **kwargs):
+        del kwargs  # Unused
+        self.add_request_calls += 1
+
+    def abort_request(self, request_id):
+        del request_id  # Unused
+        self.abort_request_calls += 1
+
+
+class MockAsyncLLMEngine(AsyncLLMEngine):
+
+    def _init_engine(self, *args, **kwargs):
+        return MockEngine()
+
+
+@pytest.mark.asyncio
+async def test_new_requests_event():
+    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
+    engine.start_background_loop()
+    await asyncio.sleep(0.01)
+    assert engine.engine.step_calls == 0
+
+    await engine.add_request("1", "", None)
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 1
+    assert engine.engine.step_calls == 1
+
+    await engine.add_request("2", "", None)
+    engine.engine.generate("2")
+    await asyncio.sleep(0)
+    assert engine.engine.add_request_calls == 2
+    assert engine.engine.step_calls == 2
+    await asyncio.sleep(0)
+    assert engine.engine.step_calls == 3
+    engine.engine.stop_generating()
+    await asyncio.sleep(0)
+    assert engine.engine.step_calls == 4
+    await asyncio.sleep(0)
+    assert engine.engine.step_calls == 4
+
+    await engine.add_request("3", "", None)
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 3
+    assert engine.engine.step_calls == 5
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 3
+    assert engine.engine.step_calls == 5
tests/async_engine/test_openai_server.py (new file, 119 lines)
@@ -0,0 +1,119 @@
+from argparse import Namespace
+from dataclasses import dataclass
+
+import pytest
+from fastapi.testclient import TestClient
+
+from vllm.entrypoints.openai.api_server import *
+
+# Define models, templates, and their corresponding expected outputs
+MODEL_TEMPLATE_GENERATON_OUTPUT = [
+    ("facebook/opt-125m", None, True,
+     "Hello</s>Hi there!</s>What is the capital of</s>"),
+    ("facebook/opt-125m", None, False,
+     "Hello</s>Hi there!</s>What is the capital of</s>"),
+    ("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
+     """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of<|im_end|>
+<|im_start|>assistant
+"""),
+    ("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
+     """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of""")
+]
+
+TEST_MESSAGES = [
+    {
+        'role': 'user',
+        'content': 'Hello'
+    },
+    {
+        'role': 'assistant',
+        'content': 'Hi there!'
+    },
+    {
+        'role': 'user',
+        'content': 'What is the capital of'
+    },
+]
+client = TestClient(app)
+
+
+@dataclass
+class MockTokenizer:
+    chat_template = None
+
+
+def test_load_chat_template():
+    # Testing chatml template
+    template = "../../examples/template_chatml.jinja"
+    mock_args = Namespace(chat_template=template)
+    tokenizer = MockTokenizer()
+
+    # Call the function with the mocked args
+    load_chat_template(mock_args, tokenizer)
+
+    template_content = tokenizer.chat_template
+
+    # Test assertions
+    assert template_content is not None
+    # Hard coded value for template_chatml.jinja
+    assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
+
+
+def test_no_load_chat_template():
+    # Testing chatml template
+    template = "../../examples/does_not_exist"
+    mock_args = Namespace(chat_template=template)
+    tokenizer = MockTokenizer()
+
+    # Call the function with the mocked args
+    load_chat_template(mock_args, tokenizer=tokenizer)
+    template_content = tokenizer.chat_template
+
+    # Test assertions
+    assert template_content is not None
+    # Hard coded value for template_chatml.jinja
+    assert template_content == """../../examples/does_not_exist"""
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model,template,add_generation_prompt,expected_output",
+    MODEL_TEMPLATE_GENERATON_OUTPUT)
+async def test_get_gen_prompt(model, template, add_generation_prompt,
+                              expected_output):
+    # Initialize the tokenizer
+    tokenizer = get_tokenizer(tokenizer_name=model)
+
+    mock_args = Namespace(chat_template=template)
+    load_chat_template(mock_args, tokenizer)
+
+    # Create a mock request object using keyword arguments
+    mock_request = ChatCompletionRequest(
+        model=model,
+        messages=TEST_MESSAGES,
+        add_generation_prompt=add_generation_prompt)
+
+    # Call the function and get the result
+    result = tokenizer.apply_chat_template(
+        conversation=mock_request.messages,
+        tokenize=False,
+        add_generation_prompt=mock_request.add_generation_prompt)
+
+    # Test assertion
+    assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}"
+
+
+def test_health_endpoint():
+    response = client.get("/health")
+    assert response.status_code == 200
@@ -4,10 +4,25 @@ from vllm.engine.async_llm_engine import RequestTracker
 from vllm.outputs import RequestOutput


+class DummyEvent:
+
+    def __init__(self):
+        self.flag = False
+
+    def set(self):
+        self.flag = True
+
+    def clear(self):
+        self.flag = False
+
+
 def test_request_tracker():
     tracker = RequestTracker()
+    tracker.new_requests_event = DummyEvent()
     stream_1 = tracker.add_request("1")
+    assert tracker.new_requests_event.flag
     new, finished = tracker.get_new_and_finished_requests()
+    assert not tracker.new_requests_event.flag
     assert len(new) == 1
     assert new[0]["request_id"] == "1"
     assert not finished
@@ -15,7 +30,9 @@ def test_request_tracker():

     stream_2 = tracker.add_request("2")
     stream_3 = tracker.add_request("3")
+    assert tracker.new_requests_event.flag
     new, finished = tracker.get_new_and_finished_requests()
+    assert not tracker.new_requests_event.flag
     assert len(new) == 2
     assert new[0]["request_id"] == "2"
     assert new[1]["request_id"] == "3"
@@ -26,6 +43,7 @@ def test_request_tracker():
     # request_ids must be unique
     with pytest.raises(KeyError):
         tracker.add_request("1")
+    assert not tracker.new_requests_event.flag

     tracker.abort_request("1")
     new, finished = tracker.get_new_and_finished_requests()
@@ -36,6 +54,7 @@ def test_request_tracker():

     stream_4 = tracker.add_request("4")
     tracker.abort_request("4")
+    assert tracker.new_requests_event.flag
     new, finished = tracker.get_new_and_finished_requests()
     assert len(finished) == 1
     assert "4" in finished
@@ -43,9 +62,11 @@ def test_request_tracker():
     assert stream_4.finished

     stream_5 = tracker.add_request("5")
+    assert tracker.new_requests_event.flag
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], finished=True))
     new, finished = tracker.get_new_and_finished_requests()
+    assert not tracker.new_requests_event.flag
     assert len(finished) == 1
     assert "2" in finished
     assert len(new) == 1
@@ -106,6 +106,39 @@ class HfRunner:
             outputs[i] = (output_ids, output_str)
         return outputs

+    def generate_greedy_logprobs(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+    ) -> List[List[torch.Tensor]]:
+        all_logprobs = []
+        for prompt in prompts:
+            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+            output = self.model.generate(
+                input_ids.cuda(),
+                use_cache=True,
+                do_sample=False,
+                max_new_tokens=max_tokens,
+                output_hidden_states=True,
+                return_dict_in_generate=True,
+            )
+            seq_logprobs = []
+            for hidden_states in output.hidden_states:
+                last_hidden_states = hidden_states[-1][0]
+                logits = torch.matmul(
+                    last_hidden_states,
+                    self.model.get_output_embeddings().weight.t(),
+                )
+                if self.model.get_output_embeddings().bias is not None:
+                    logits += self.model.get_output_embeddings(
+                    ).bias.unsqueeze(0)
+                logprobs = torch.nn.functional.log_softmax(logits,
+                                                           dim=-1,
+                                                           dtype=torch.float32)
+                seq_logprobs.append(logprobs)
+            all_logprobs.append(seq_logprobs)
+        return all_logprobs
+

 @pytest.fixture
 def hf_runner():
tests/distributed/test_comm_ops.py (new file, 83 lines)
@@ -0,0 +1,83 @@
+"""Test the communication operators.
+
+Run `pytest tests/distributed/test_comm_ops.py --forked`.
+"""
+from multiprocessing import Process, set_start_method
+
+import pytest
+import torch
+
+from vllm.config import ParallelConfig
+from vllm.engine.ray_utils import get_open_port
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_all_reduce,
+    tensor_model_parallel_all_gather,
+)
+from vllm.worker.worker import _init_distributed_environment
+
+
+def init_test_distributed_environment(pipeline_parallel_size: int,
+                                      tensor_parallel_size: int, rank: int,
+                                      distributed_init_port: str):
+    parallel_config = ParallelConfig(pipeline_parallel_size,
+                                     tensor_parallel_size,
+                                     worker_use_ray=True)
+    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+    torch.cuda.set_device(rank)
+    _init_distributed_environment(parallel_config, rank,
+                                  distributed_init_method)
+
+
+def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
+                           distributed_init_port: str):
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
+                                      distributed_init_port)
+    num_elements = 8
+    all_tensors = [
+        torch.arange(num_elements, dtype=torch.float32, device="cuda") *
+        (r + 1) for r in range(tensor_parallel_size)
+    ]
+    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
+    t = all_tensors[rank]
+    t = tensor_model_parallel_all_reduce(t)
+    assert torch.allclose(t, expected)
+
+
+def all_gather_test_worker(tensor_parallel_size: int, rank: int,
+                           distributed_init_port: str):
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
+                                      distributed_init_port)
+    num_dimensions = 3
+    tensor_size = list(range(2, num_dimensions + 2))
+    total_size = 1
+    for s in tensor_size:
+        total_size *= s
+    for all_gather_dimension in range(num_dimensions):
+        all_tensors = [
+            torch.arange(total_size, dtype=torch.float32,
+                         device="cuda").reshape(tensor_size) * (r + 1)
+            for r in range(tensor_parallel_size)
+        ]
+        expected = torch.cat(all_tensors, dim=all_gather_dimension)
+        t = all_tensors[rank]
+        t = tensor_model_parallel_all_gather(t, all_gather_dimension)
+        assert torch.allclose(t, expected)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.parametrize("test_target",
+                         [all_reduce_test_worker, all_gather_test_worker])
+def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
+    set_start_method("spawn", force=True)
+    distributed_init_port = get_open_port()
+    processes = []
+    for rank in range(tensor_parallel_size):
+        p = Process(target=test_target,
+                    args=(tensor_parallel_size, rank, distributed_init_port))
+        p.start()
+        processes.append(p)
+    for p in processes:
+        p.join()
+    assert all(p.exitcode == 0 for p in processes)
tests/engine/test_detokenize.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+import pytest
+
+from transformers import AutoTokenizer
+
+from vllm.transformers_utils.tokenizer import detokenize_incrementally
+
+TRUTH = [
+    "Hello here, this is a simple test",  # noqa: E501
+    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa: E501
+    "我很感谢你的热情"  # noqa: E501
+]
+TOKENIZERS = [
+    "facebook/opt-125m",
+    "gpt2",
+    "bigcode/tiny_starcoder_py",
+    "EleutherAI/gpt-j-6b",
+    "EleutherAI/pythia-70m",
+    "bigscience/bloom-560m",
+    "mosaicml/mpt-7b",
+    "tiiuae/falcon-7b",
+    "meta-llama/Llama-2-7b-hf",
+    "codellama/CodeLlama-7b-hf",
+]
+
+
+def _run_incremental_decode(tokenizer, all_input_ids,
+                            skip_special_tokens: bool):
+    decoded_text = ""
+    offset = 0
+    token_offset = 0
+    prev_tokens = None
+    for i in range(len(all_input_ids)):
+        new_tokens, text, offset, token_offset = detokenize_incrementally(
+            tokenizer,
+            all_input_ids[:i + 1],
+            prev_tokens,
+            offset,
+            token_offset,
+            skip_special_tokens=skip_special_tokens)
+        decoded_text += text
+        if prev_tokens is None:
+            prev_tokens = new_tokens
+        else:
+            prev_tokens += new_tokens
+    return decoded_text
+
+
+@pytest.mark.parametrize("truth", TRUTH)
+@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
+@pytest.mark.parametrize("skip_special_tokens", (True, False))
+def test_decode_streaming(tokenizer_id, truth, skip_special_tokens):
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
+    if skip_special_tokens:
+        all_input_ids = ([tokenizer.bos_token_id]
+                         if tokenizer.bos_token_id is not None else
+                         []) + all_input_ids + [tokenizer.eos_token_id]
+
+    decoded_text = _run_incremental_decode(
+        tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens)
+
+    assert decoded_text == truth
@@ -1,9 +1,7 @@
 import pytest
 import torch
-import torch.nn.functional as F
-from transformers.activations import get_activation

-from vllm import activation_ops
+from vllm.model_executor.layers.activation import FastGELU, NewGELU, SiluAndMul

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
@@ -11,11 +9,6 @@ D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
 SEEDS = [0]


-def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
-    x1, x2 = x.chunk(chunks=2, dim=1)
-    return F.silu(x1) * x2
-
-
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -29,10 +22,10 @@ def test_silu_and_mul(
 ) -> None:
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)
-    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device='cuda')
-    out = torch.empty(num_tokens, d, dtype=dtype, device='cuda')
-    activation_ops.silu_and_mul(out, x)
-    ref_out = ref_silu_and_mul(x)
+    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda")
+    layer = SiluAndMul()
+    out = layer(x)
+    ref_out = layer._forward(x)
     assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)


@@ -49,10 +42,10 @@ def test_gelu_new(
 ) -> None:
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)
-    x = torch.randn(num_tokens, d, dtype=dtype, device='cuda')
-    out = torch.empty(num_tokens, d, dtype=dtype, device='cuda')
-    activation_ops.gelu_new(out, x)
-    ref_out = get_activation("gelu_new")(x)
+    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
+    layer = NewGELU()
+    out = layer(x)
+    ref_out = layer._forward(x)
     assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)


@@ -68,8 +61,8 @@ def test_gelu_fast(
 ) -> None:
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)
-    x = torch.randn(num_tokens, d, dtype=dtype, device='cuda')
-    out = torch.empty(num_tokens, d, dtype=dtype, device='cuda')
-    activation_ops.gelu_fast(out, x)
-    ref_out = get_activation("gelu_fast")(x)
+    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
+    layer = FastGELU()
+    out = layer(x)
+    ref_out = layer._forward(x)
     assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
@@ -6,17 +6,22 @@ import torch
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

-from vllm import attention_ops
+from vllm._C import ops
+from vllm.utils import get_max_shared_memory_bytes

-MAX_SEQ_LEN = 8192
-NUM_BLOCKS = 128  # Arbitrary values for testing
+FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
+# This will change depending on the compute capability.
+# - 512 as a buffer
+MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
+NUM_BLOCKS = 40000  # Arbitrary values for testing
+PARTITION_SIZE = 512

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_GEN_SEQS = [7]  # Arbitrary values for testing
-NUM_PREFILL_SEQS = [1, 3, 7]  # Arbitrary values for testing
+NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
 NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
-BLOCK_SIZES = [8, 16, 32]
+BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
 SEEDS = [0]

@@ -92,6 +97,7 @@ def ref_single_query_cached_kv_attention(
         output[i].copy_(out, non_blocking=True)


+@pytest.mark.parametrize("version", ["v1", "v2"])
 @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -99,9 +105,9 @@ def ref_single_query_cached_kv_attention(
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@torch.inference_mode()
-def test_single_query_cached_kv_attention(
+def test_paged_attention(
     kv_cache_factory,
+    version: str,
     num_seqs: int,
     num_heads: Tuple[int, int],
     head_size: int,
@@ -135,6 +141,7 @@ def test_single_query_cached_kv_attention(
                               device="cuda")

     context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
+    context_lens[-1] = MAX_SEQ_LEN
     max_context_len = max(context_lens)
     context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")

@@ -157,19 +164,54 @@ def test_single_query_cached_kv_attention(

     # Call the paged attention kernel.
     output = torch.empty_like(query)
-    attention_ops.single_query_cached_kv_attention(
-        output,
-        query,
-        key_cache,
-        value_cache,
-        head_mapping,
-        scale,
-        block_tables,
-        context_lens,
-        block_size,
-        max_context_len,
-        alibi_slopes,
-    )
+    if version == "v1":
+        ops.paged_attention_v1(
+            output,
+            query,
+            key_cache,
+            value_cache,
+            head_mapping,
+            scale,
+            block_tables,
+            context_lens,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+        )
+    elif version == "v2":
+        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
+                          PARTITION_SIZE)
+        assert PARTITION_SIZE % block_size == 0
+        num_seqs, num_heads, head_size = output.shape
+        tmp_output = torch.empty(
+            size=(num_seqs, num_heads, num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_heads, num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+        ops.paged_attention_v2(
+            output,
+            exp_sums,
+            max_logits,
+            tmp_output,
+            query,
+            key_cache,
+            value_cache,
+            head_mapping,
+            scale,
+            block_tables,
+            context_lens,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+        )
+    else:
+        raise AssertionError(f"Unknown version: {version}")

     # Run the reference implementation.
     ref_output = torch.empty_like(query)
@@ -242,7 +284,11 @@ def test_multi_query_kv_attention(
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)

-    seq_lens = random.sample(range(1, MAX_SEQ_LEN), num_seqs)
+    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
+    # As the xformers library is already tested with its own tests, we can use
+    # a smaller MAX_SEQ_LEN here.
+    max_len = min(MAX_SEQ_LEN, 4096)
+    seq_lens = random.sample(range(1, max_len), num_seqs)
     num_tokens = sum(seq_lens)

     scale = float(1.0 / (head_size**0.5))
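The v2 path in the test above sizes its temporary buffers by partitioning the context. A worked instance of that arithmetic (the numbers are illustrative, not from the test):

PARTITION_SIZE = 512
max_context_len = 1300
# Ceiling division: each sequence gets one tmp_output slice per partition.
num_partitions = (max_context_len + PARTITION_SIZE - 1) // PARTITION_SIZE
assert num_partitions == 3  # ceil(1300 / 512)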
@@ -3,16 +3,16 @@ import random
 import pytest
 import torch

-from vllm import cache_ops
+from vllm._C import cache_ops

 DTYPES = [torch.half, torch.bfloat16, torch.float]
-NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
-NUM_LAYERS = [5]  # Arbitrary values for testing
+NUM_TOKENS = [83]  # Arbitrary values for testing
+NUM_LAYERS = [1]  # Arbitrary values for testing
 NUM_HEADS = [8]  # Arbitrary values for testing
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
 BLOCK_SIZES = [8, 16, 32]
-NUM_BLOCKS = [1024]  # Arbitrary values for testing
-NUM_MAPPINGS = [32, 256]  # Arbitrary values for testing
+NUM_BLOCKS = [1024, 36000]  # Arbitrary values for testing
+NUM_MAPPINGS = [256]  # Arbitrary values for testing
 SEEDS = [0]


@@ -69,9 +69,9 @@ def test_copy_blocks(
     for src, dsts in block_mapping.items():
         for dst in dsts:
             for cloned_key_cache in cloned_key_caches:
-                cloned_key_cache[dst] = cloned_key_cache[src]
+                cloned_key_cache[dst].copy_(cloned_key_cache[src])
             for cloned_value_cache in cloned_value_caches:
-                cloned_value_cache[dst] = cloned_value_cache[src]
+                cloned_value_cache[dst].copy_(cloned_value_cache[src])

     # Compare the results.
     for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
@@ -106,14 +106,14 @@ def test_reshape_and_cache(
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
     slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device='cuda')
+    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device="cuda")

     qkv = torch.randn(num_tokens,
                       3,
                       num_heads,
                       head_size,
                       dtype=dtype,
-                      device='cuda')
+                      device="cuda")
     _, key, value = qkv.unbind(dim=1)

     # Create the KV caches.
@@ -132,7 +132,7 @@ def test_reshape_and_cache(

     # Run the reference implementation.
     reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
-    block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor')
+    block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
     block_indicies = block_indicies.cpu().tolist()
     block_offsets = slot_mapping % block_size
     block_offsets = block_offsets.cpu().tolist()
@@ -1,58 +1,47 @@
 import pytest
 import torch
-import torch.nn as nn

-from vllm import layernorm_ops
+from vllm.model_executor.layers.layernorm import RMSNorm

 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HIDDEN_SIZES = [67, 768, 2048, 5120, 8192]  # Arbitrary values for testing
 NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
+HIDDEN_SIZES = [768, 5120, 8192]  # Arbitrary values for testing
+ADD_RESIDUAL = [False, True]
 SEEDS = [0]


-class RefRMSNorm(nn.Module):
-
-    def __init__(self, hidden_size, eps=1e-6):
-        super().__init__()
-        weight = torch.empty(hidden_size)
-        weight.normal_(mean=1.0, std=0.1)
-        self.weight = nn.Parameter(weight)
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance +
-                                                    self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
 def test_rms_norm(
     num_tokens: int,
     hidden_size: int,
+    add_residual: bool,
     dtype: torch.dtype,
     seed: int,
 ) -> None:
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)

-    scale = float(hidden_size**-0.5)
-    x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda")
-    x.uniform_(-scale, scale)
-    ref = RefRMSNorm(hidden_size).to(dtype).cuda()
+    layer = RMSNorm(hidden_size).to(dtype).cuda()
+    layer.weight.data.normal_(mean=1.0, std=0.1)
+    scale = 1 / (2 * hidden_size)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device="cuda")
+    x *= scale
+    residual = torch.randn_like(x) * scale if add_residual else None

-    out = torch.empty_like(x)
-    layernorm_ops.rms_norm(
-        out,
-        x,
-        ref.weight.data,
-        ref.variance_epsilon,
-    )
-    ref_out = ref(x)
-    assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5)
+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_out = layer._forward(x, residual)
+    out = layer(x, residual)
+    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
+    # numerical errors than other operators because they involve reductions.
+    # Therefore, we use a larger tolerance.
+    if add_residual:
+        assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
+        assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
+    else:
+        assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
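
Side note, not part of the diff: a minimal sketch of the RMSNorm math that the removed RefRMSNorm implemented (this mirrors the reference, not vLLM's fused kernel). The reduction runs in float32 for stability:

    import torch

    def ref_rms_norm(x: torch.Tensor, weight: torch.Tensor,
                     eps: float = 1e-6) -> torch.Tensor:
        # Upcast so the mean-of-squares reduction does not lose precision.
        orig_dtype = x.dtype
        x = x.to(torch.float32)
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + eps)
        return weight * x.to(orig_dtype)

    x = torch.randn(4, 8, dtype=torch.float16)
    out = ref_rms_norm(x, torch.ones(8, dtype=torch.float16))
    assert out.shape == x.shape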
@@ -1,105 +1,23 @@
-from typing import Optional, Tuple
+from typing import Optional

 import pytest
 import torch
-import torch.nn as nn
-import torch.nn.functional as F

-from vllm import pos_encoding_ops
+from vllm.model_executor.layers.rotary_embedding import get_rope

 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
 ROTARY_DIMS = [None, 32]  # None means rotary dim == head size
-NUM_HEADS = [7, 12, 40, 52]  # Arbitrary values for testing
-NUM_TOKENS = [11, 83, 2048]  # Arbitrary values for testing
+NUM_HEADS = [7, 17]  # Arbitrary values for testing
+BATCH_SIZES = [1, 5]  # Arbitrary values for testing
+SEQ_LENS = [11, 8192]  # Arbitrary values for testing
 SEEDS = [0]


-def rotate_neox(x: torch.Tensor) -> torch.Tensor:
-    x1 = x[..., :x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2:]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
-    x1 = x[..., ::2]
-    x2 = x[..., 1::2]
-    x = torch.stack((-x2, x1), dim=-1)
-    return x.flatten(-2)
-
-
-def apply_rope(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    cos: torch.Tensor,
-    sin: torch.Tensor,
-    is_neox_style: bool,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    rotate_fn = rotate_neox if is_neox_style else rotate_gptj
-    q_embed = (q * cos) + (rotate_fn(q) * sin)
-    k_embed = (k * cos) + (rotate_fn(k) * sin)
-    return q_embed, k_embed
-
-
-class RefRotaryEmbedding(nn.Module):
-    """Reference implementation of rotary embedding."""
-
-    def __init__(
-        self,
-        dim: int,
-        is_neox_style: bool,
-        max_position_embeddings: int = 8192,
-        base: int = 10000,
-    ) -> None:
-        super().__init__()
-        self.rotary_dim = dim
-        self.is_neox_style = is_neox_style
-        self.max_position_embeddings = max_position_embeddings
-
-        # Create cos and sin embeddings.
-        inv_freq = 1.0 / (base**(torch.arange(0, dim, 2) / dim))
-        t = torch.arange(max_position_embeddings).float()
-        freqs = torch.einsum("i,j->ij", t, inv_freq.float())
-        if is_neox_style:
-            emb = torch.cat((freqs, freqs), dim=-1)
-        else:
-            emb = torch.repeat_interleave(freqs, 2, -1)
-        cos = emb.cos().to(dtype=inv_freq.dtype)
-        sin = emb.sin().to(dtype=inv_freq.dtype)
-        self.register_buffer("cos_cached", cos, persistent=False)
-        self.register_buffer("sin_cached", sin, persistent=False)
-
-    def forward(
-        self,
-        positions: torch.Tensor,  # [num_tokens]
-        query: torch.Tensor,  # [num_tokens, num_heads, head_size]
-        key: torch.Tensor,  # [num_tokens, num_heads, head_size]
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        query_rot = query[..., :self.rotary_dim]
-        query_pass = query[..., self.rotary_dim:]
-        key_rot = key[..., :self.rotary_dim]
-        key_pass = key[..., self.rotary_dim:]
-
-        query_rot = query_rot.transpose(0, 1)
-        key_rot = key_rot.transpose(0, 1)
-        cos = F.embedding(positions, self.cos_cached)
-        sin = F.embedding(positions, self.sin_cached)
-
-        query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin,
-                                        self.is_neox_style)
-        query_rot = query_rot.transpose(0, 1).contiguous()
-        key_rot = key_rot.transpose(0, 1).contiguous()
-
-        query = torch.cat((query_rot, query_pass), dim=-1)
-        key = torch.cat((key_rot, key_pass), dim=-1)
-
-        # Output query/key shape: [num_tokens, num_tokens, head_size]
-        return query, key
-
-
 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
-@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@@ -108,7 +26,8 @@ class RefRotaryEmbedding(nn.Module):
 @torch.inference_mode()
 def test_rotary_embedding(
     is_neox_style: bool,
-    num_tokens: int,
+    batch_size: int,
+    seq_len: int,
     num_heads: int,
     head_size: int,
     rotary_dim: Optional[int],
@@ -122,52 +41,25 @@ def test_rotary_embedding(
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)

-    positions = torch.randint(0, max_position, (num_tokens, ), device="cuda")
-    query = torch.randn(num_tokens,
-                        num_heads * head_size,
-                        dtype=dtype,
-                        device="cuda")
-    key = torch.randn(num_tokens,
-                      num_heads * head_size,
-                      dtype=dtype,
-                      device="cuda")
-
-    # Create the rotary embedding.
-    inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
-    t = torch.arange(max_position).float()
-    freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
-    cos = freqs.cos()
-    sin = freqs.sin()
-    cos_sin_cache = torch.cat((cos, sin), dim=-1)
-    cos_sin_cache = cos_sin_cache.to(dtype=dtype, device='cuda')
-
-    # Run the kernel. The kernel is in-place, so we need to clone the inputs.
-    out_query = query.clone()
-    out_key = key.clone()
-    pos_encoding_ops.rotary_embedding(
-        positions,
-        out_query,
-        out_key,
-        head_size,
-        cos_sin_cache,
-        is_neox_style,
-    )
-
-    # Run the reference implementation.
-    ref_rotary_embedding = RefRotaryEmbedding(
-        dim=rotary_dim,
-        is_neox_style=is_neox_style,
-        max_position_embeddings=max_position,
-        base=base,
-    ).to(dtype=dtype, device="cuda")
-    ref_query, ref_key = ref_rotary_embedding(
-        positions,
-        query.view(num_tokens, num_heads, head_size),
-        key.view(num_tokens, num_heads, head_size),
-    )
-    ref_query = ref_query.view(num_tokens, num_heads * head_size)
-    ref_key = ref_key.view(num_tokens, num_heads * head_size)
-
+    if rotary_dim is None:
+        rotary_dim = head_size
+    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+    rope = rope.to(dtype).cuda()
+
+    positions = torch.randint(0,
+                              max_position, (batch_size, seq_len),
+                              device="cuda")
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype,
+                        device="cuda")
+    key = torch.randn_like(query)
+
+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_query, ref_key = rope._forward(positions, query, key)
+    out_query, out_key = rope.forward(positions, query, key)
     # Compare the results.
     assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5)
     assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5)
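
Side note, not part of the diff: the two rotation styles the removed helpers implemented differ only in how the last dimension is split. A self-contained sketch:

    import torch

    def rotate_neox(x: torch.Tensor) -> torch.Tensor:
        # GPT-NeoX style: rotate the two contiguous halves.
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
        # GPT-J style: rotate interleaved even/odd pairs.
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        return torch.stack((-x2, x1), dim=-1).flatten(-2)

    x = torch.arange(4, dtype=torch.float)
    print(rotate_neox(x))  # tensor([-2., -3.,  0.,  1.])
    print(rotate_gptj(x))  # tensor([-1.,  0., -3.,  2.])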
@@ -6,14 +6,16 @@ import pytest

 MODELS = [
     "facebook/opt-125m",
+    "meta-llama/Llama-2-7b-hf",
+    "mistralai/Mistral-7B-v0.1",
+    "tiiuae/falcon-7b",
     "gpt2",
     "bigcode/tiny_starcoder_py",
     "EleutherAI/gpt-j-6b",
     "EleutherAI/pythia-70m",
     "bigscience/bloom-560m",
     "mosaicml/mpt-7b",
-    "tiiuae/falcon-7b",
-    "meta-llama/Llama-2-7b-hf",
+    "microsoft/phi-1_5",
 ]
tests/samplers/test_logprobs.py (new file, 55 lines)
@@ -0,0 +1,55 @@
+import pytest
+import torch
+
+from vllm import SamplingParams
+
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_get_prompt_logprobs(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype,
+    example_prompts,
+):
+    max_tokens = 5
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_logprobs = hf_model.generate_greedy_logprobs(
+        example_prompts,
+        max_tokens=max_tokens,
+    )
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
+                                          logprobs=5,
+                                          prompt_logprobs=5,
+                                          temperature=0.0)
+    vllm_results = vllm_model.model.generate(
+        example_prompts, sampling_params=vllm_sampling_params)
+
+    # Test whether logprobs are included in the results.
+    for result in vllm_results:
+        assert result.prompt_logprobs is not None
+        assert result.outputs[0].logprobs is not None
+
+    # Test whether prompt logprobs are consistent with HF
+    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
+        # Check prompt logprobs
+        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
+        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
+            for token_id, logprob in vllm_prompt_logprob_dict.items():
+                torch.testing.assert_close(logprob,
+                                           hf_logprob[0][i][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
+        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
+        for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs):
+            for token_id, logprob in vllm_sample_logprob_dict.items():
+                torch.testing.assert_close(logprob,
+                                           hf_logprob[i][-1][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
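
Side note, not part of the diff: a hedged usage sketch of the API this new test exercises (the model and prompt here are illustrative):

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    params = SamplingParams(max_tokens=5,
                            logprobs=5,
                            prompt_logprobs=5,
                            temperature=0.0)
    outputs = llm.generate(["Hello, my name is"], sampling_params=params)
    for output in outputs:
        # One {token_id: logprob} dict per position; the first prompt
        # position has no logprob, hence the [1:] slice in the test above.
        print(output.prompt_logprobs)
        print(output.outputs[0].logprobs)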
tests/samplers/test_sampler.py (new file, 235 lines)
@@ -0,0 +1,235 @@
+import random
+from typing import Tuple
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.utils import set_random_seed
+from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
+from vllm.worker.model_runner import ModelRunner
+
+
+class MockLogitsSampler(Sampler):
+
+    def __init__(self, vocab_size: int, fake_logits: torch.Tensor):
+        super().__init__(vocab_size=vocab_size)
+        self.fake_logits = fake_logits
+
+    def forward(self, *args, **kwargs):
+        with patch("vllm.model_executor.layers.sampler._prune_hidden_states",
+                   lambda x, y: x), patch(
+                       "vllm.model_executor.layers.sampler._get_logits",
+                       lambda *args, **kwargs: self.fake_logits):
+            return super().forward(*args, **kwargs)
+
+
+def _prepare_test(
+    batch_size: int
+) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
+    vocab_size = 32000
+    input_tensor = torch.rand((batch_size, 1024),
+                              device="cuda",
+                              dtype=torch.float16)
+    fake_logits = torch.full((batch_size, vocab_size),
+                             1e-2,
+                             device=input_tensor.device,
+                             dtype=input_tensor.dtype)
+    sampler = MockLogitsSampler(32000, fake_logits)
+    model_runner = ModelRunner(None, None, None)
+    return input_tensor, fake_logits, sampler, model_runner
+
+
+RANDOM_SEEDS = list(range(128))
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+def test_sampler_all_greedy(seed: int):
+    set_random_seed(seed)
+    batch_size = random.randint(1, 256)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)
+
+    seq_group_metadata_list = []
+    prompt_lens = []
+    for i in range(batch_size):
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData([1, 2, 3])},
+                sampling_params=SamplingParams(temperature=0, ),
+                block_tables={0: [1]},
+            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
+    sampler_output = sampler(embedding=None,
+                             hidden_states=input_tensor,
+                             sampling_metadata=sampling_metadata)
+    expected = torch.argmax(fake_logits, dim=-1)
+    for i, sequence_output in enumerate(sampler_output):
+        for nth_output in sequence_output.samples:
+            assert nth_output.output_token == expected[i].item()
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+def test_sampler_all_random(seed: int):
+    set_random_seed(seed)
+    batch_size = random.randint(1, 256)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)
+
+    for i in range(batch_size):
+        fake_logits[i, i] = 1e2
+
+    seq_group_metadata_list = []
+    prompt_lens = []
+    for i in range(batch_size):
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData([1, 2, 3])},
+                sampling_params=SamplingParams(
+                    temperature=1.0,
+                    n=random.randint(1, 10),
+                ),
+                block_tables={0: [1]},
+            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
+    sampler_output = sampler(embedding=None,
+                             hidden_states=input_tensor,
+                             sampling_metadata=sampling_metadata)
+    for i, sequence_output in enumerate(sampler_output):
+        for nth_output in sequence_output.samples:
+            assert nth_output.output_token == i
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+def test_sampler_all_beam(seed: int):
+    set_random_seed(seed)
+    batch_size = random.randint(1, 256)
+    input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
+
+    seq_group_metadata_list = []
+    prompt_lens = []
+    for i in range(batch_size):
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData([1, 2, 3])},
+                sampling_params=SamplingParams(
+                    temperature=0,
+                    best_of=2,
+                    use_beam_search=True,
+                ),
+                block_tables={0: [1]},
+            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
+    sampler(embedding=None,
+            hidden_states=input_tensor,
+            sampling_metadata=sampling_metadata)
+    # no assertion here as I am not sure how to determine whether
+    # the outputs are expected - in other words, this just tests
+    # whether there are no exceptions in the sampler
+    # when handling an all-beam search case.
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+def test_sampler_mixed(seed: int):
+    set_random_seed(seed)
+    batch_size = random.randint(1, 256)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)
+
+    seq_group_metadata_list = []
+    expected_tokens = []
+    prompt_lens = []
+    for i in range(batch_size):
+        n = 1
+        sampling_type = random.randint(0, 2)
+        if sampling_type == 0:
+            sampling_params = SamplingParams(temperature=0)
+        elif sampling_type == 1:
+            n = random.randint(1, 10)
+            sampling_params = SamplingParams(
+                temperature=random.random() + 0.1,
+                top_p=min(random.random() + 0.1, 1),
+                top_k=random.randint(0, 10) or -1,
+                n=n,
+                presence_penalty=random.randint(0, 1),
+            )
+        else:
+            sampling_params = SamplingParams(temperature=0,
+                                             use_beam_search=True,
+                                             best_of=2)
+        for idx in range(n):
+            fake_logits[i, i + idx] = 1e2
+            expected_tokens.append(i + idx)
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData([1, 2, 3])},
+                sampling_params=sampling_params,
+                block_tables={0: [1]},
+            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
+    sampler_output = sampler(embedding=None,
+                             hidden_states=input_tensor,
+                             sampling_metadata=sampling_metadata)
+    for i, sequence_output in enumerate(sampler_output):
+        if seq_group_metadata_list[i].sampling_params.use_beam_search:
+            continue
+        for nth_output in sequence_output.samples:
+            assert nth_output.output_token in expected_tokens
+
+
+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+def test_sampler_logits_processors(seed: int):
+    set_random_seed(seed)
+    batch_size = random.randint(1, 256)
+    input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
+
+    # This sample logits processor gives infinite score to the i-th token,
+    # where i is the length of the input sequence.
+    # We therefore expect the output token sequence to be [0, 1, 2, ...]
+    def pick_ith(token_ids, logits):
+        logits[len(token_ids)] = float("inf")
+        return logits
+
+    seq_group_metadata_list = []
+    prompt_lens = []
+    for i in range(batch_size):
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData([1, 2, 3])},
+                sampling_params=SamplingParams(temperature=0,
+                                               logits_processors=[pick_ith]),
+                block_tables={0: [1]},
+            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
+
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
+    sampler_output = sampler(embedding=None,
+                             hidden_states=input_tensor,
+                             sampling_metadata=sampling_metadata)
+    for _, sequence_output in enumerate(sampler_output):
+        for idx, nth_output in enumerate(sequence_output.samples):
+            assert nth_output.output_token == idx
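
Side note, not part of the diff: MockLogitsSampler above relies on a standard unittest.mock.patch pattern, replacing module-level helpers by their qualified names so the class under test sees deterministic logits. A minimal, self-contained illustration of the same pattern:

    from unittest.mock import patch

    def _get_logits(x):
        raise RuntimeError("real implementation unavailable in tests")

    def forward(x):
        return _get_logits(x) + 1

    # Patch the helper where it is looked up, not where it is defined.
    with patch(f"{__name__}._get_logits", lambda x: 41):
        assert forward(0) == 42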
tests/test_regression.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+"""Tests that check for regressions in vLLM's behavior.
+
+It should include tests reported by users, to make sure the same issues
+never happen again.
+"""
+from vllm import LLM, SamplingParams
+
+
+def test_duplicated_ignored_sequence_group():
+    """https://github.com/vllm-project/vllm/issues/1655"""
+
+    sampling_params = SamplingParams(temperature=0.01,
+                                     top_p=0.1,
+                                     max_tokens=256)
+    llm = LLM(model="facebook/opt-125m",
+              max_num_batched_tokens=4096,
+              tensor_parallel_size=1)
+    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
+    outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+    assert len(prompts) == len(outputs)
+
+
+if __name__ == "__main__":
+    import pytest
+    pytest.main([__file__])
tests/worker/test_model_runner.py (new file, 48 lines)
@@ -0,0 +1,48 @@
+import random
+import torch
+
+from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
+from vllm.worker.model_runner import ModelRunner
+
+
+def test_prepare_prompt():
+    model_runner = ModelRunner(None, None, None)
+    model_runner.set_block_size(16)
+
+    batch_size = random.randint(1, 256)
+    prompt_lens = []
+    seq_group_metadata_list = []
+    for i in range(batch_size):
+        # make sure all tokens fit into one block
+        prompt_len = i % (model_runner.block_size - 1) + 1
+        prompt_lens.append(prompt_len)
+        seq_data = list(range(prompt_len))
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=f"test_{i}",
+                is_prompt=True,
+                seq_data={0: SequenceData(seq_data)},
+                sampling_params=SamplingParams(temperature=0),
+                block_tables={0: [1]},
+            ))
+
+    expected_selected_token_indices = []
+    selected_token_start_idx = 0
+    max_seq_len = max(prompt_lens)
+    for prompt_len in prompt_lens:
+        expected_selected_token_indices.append(selected_token_start_idx +
+                                               prompt_len - 1)
+        selected_token_start_idx += max_seq_len
+    input_tokens, input_positions, _ = model_runner._prepare_prompt(
+        seq_group_metadata_list)
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
+    assert input_tokens.shape == (batch_size, max_seq_len)
+    assert input_positions.shape == (batch_size, max_seq_len)
+    torch.testing.assert_close(input_tokens, input_positions)
+
+    actual = sampling_metadata.selected_token_indices
+    expected = torch.tensor(expected_selected_token_indices,
+                            device=actual.device,
+                            dtype=actual.dtype)
+    torch.testing.assert_close(actual, expected)
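
Side note, not part of the diff: the index arithmetic this test checks. With prompts right-padded to max_seq_len, the last real token of prompt i lands at i * max_seq_len + prompt_len_i - 1 in the flattened batch:

    prompt_lens = [3, 7, 5]
    max_seq_len = max(prompt_lens)

    selected = []
    start = 0
    for prompt_len in prompt_lens:
        selected.append(start + prompt_len - 1)
        start += max_seq_len

    print(selected)  # [2, 13, 18]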
@@ -8,7 +8,7 @@ from vllm.entrypoints.llm import LLM
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import SamplingParams

-__version__ = "0.1.5"
+__version__ = "0.2.3"

 __all__ = [
     "LLM",
vllm/config.py (265 lines changed)
@@ -1,4 +1,5 @@
-from typing import Optional
+from typing import Optional, Union
+import os

 import torch
 from transformers import PretrainedConfig
@@ -38,6 +39,16 @@ class ModelConfig:
         will use FP16 precision for FP32 and FP16 models, and BF16 precision
         for BF16 models.
     seed: Random seed for reproducibility.
+    revision: The specific model version to use. It can be a branch name,
+        a tag name, or a commit id. If unspecified, will use the default
+        version.
+    tokenizer_revision: The specific tokenizer version to use. It can be a
+        branch name, a tag name, or a commit id. If unspecified, will use
+        the default version.
+    max_model_len: Maximum length of a sequence (including prompt and
+        output). If None, will be derived from the model.
+    quantization: Quantization method that was used to quantize the model
+        weights. If None, we assume the model weights are not quantized.
     """

     def __init__(
@@ -48,8 +59,12 @@ class ModelConfig:
         trust_remote_code: bool,
         download_dir: Optional[str],
         load_format: str,
-        dtype: str,
+        dtype: Union[str, torch.dtype],
         seed: int,
+        revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
+        max_model_len: Optional[int] = None,
+        quantization: Optional[str] = None,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -58,11 +73,28 @@ class ModelConfig:
         self.download_dir = download_dir
         self.load_format = load_format
         self.seed = seed
+        self.revision = revision
+        self.tokenizer_revision = tokenizer_revision
+        self.quantization = quantization

-        self.hf_config = get_config(model, trust_remote_code)
+        if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
+            # download model from ModelScope hub,
+            # lazy import so that modelscope is not required for normal use.
+            from modelscope.hub.snapshot_download import snapshot_download  # pylint: disable=C
+            model_path = snapshot_download(model_id=model,
+                                           cache_dir=download_dir,
+                                           revision=revision)
+            self.model = model_path
+            self.download_dir = model_path
+            self.tokenizer = model_path
+
+        self.hf_config = get_config(self.model, trust_remote_code, revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+        self.max_model_len = _get_and_verify_max_len(self.hf_config,
+                                                     max_model_len)
         self._verify_load_format()
         self._verify_tokenizer_mode()
+        self._verify_quantization()

     def _verify_load_format(self) -> None:
         load_format = self.load_format.lower()
@@ -82,6 +114,33 @@ class ModelConfig:
                 "either 'auto' or 'slow'.")
         self.tokenizer_mode = tokenizer_mode

+    def _verify_quantization(self) -> None:
+        supported_quantization = ["awq", "squeezellm"]
+        if self.quantization is not None:
+            self.quantization = self.quantization.lower()
+
+        # Parse quantization method from the HF model config, if available.
+        hf_quant_config = getattr(self.hf_config, "quantization_config", None)
+        if hf_quant_config is not None:
+            hf_quant_method = str(hf_quant_config["quant_method"]).lower()
+            if self.quantization is None:
+                self.quantization = hf_quant_method
+            elif self.quantization != hf_quant_method:
+                raise ValueError(
+                    "Quantization method specified in the model config "
+                    f"({hf_quant_method}) does not match the quantization "
+                    f"method specified in the `quantization` argument "
+                    f"({self.quantization}).")
+
+        if self.quantization is not None:
+            if self.quantization not in supported_quantization:
+                raise ValueError(
+                    f"Unknown quantization method: {self.quantization}. Must "
+                    f"be one of {supported_quantization}.")
+            logger.warning(f"{self.quantization} quantization is not fully "
+                           "optimized yet. The speed can be slower than "
+                           "non-quantized models.")
+
     def verify_with_parallel_config(
         self,
         parallel_config: "ParallelConfig",
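
Side note, not part of the diff: the precedence rule in _verify_quantization, restated as a minimal sketch (the function name here is illustrative). An explicit `quantization` argument must agree with any `quant_method` found in the HF config; otherwise the config value is adopted:

    def resolve_quant_method(arg_method, hf_quant_config):
        method = arg_method.lower() if arg_method is not None else None
        if hf_quant_config is not None:
            hf_method = str(hf_quant_config["quant_method"]).lower()
            if method is None:
                method = hf_method
            elif method != hf_method:
                raise ValueError(f"mismatch: {method} vs {hf_method}")
        return method

    assert resolve_quant_method(None, {"quant_method": "AWQ"}) == "awq"
    assert resolve_quant_method("awq", {"quant_method": "awq"}) == "awq"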
@@ -102,6 +161,12 @@ class ModelConfig:
             "must be divisible by pipeline parallel size "
             f"({pipeline_parallel_size}).")

+    def get_sliding_window(self) -> Optional[int]:
+        return getattr(self.hf_config, "sliding_window", None)
+
+    def get_vocab_size(self) -> int:
+        return self.hf_config.vocab_size
+
     def get_hidden_size(self) -> int:
         return self.hf_config.hidden_size

@@ -109,48 +174,49 @@ class ModelConfig:
         # FIXME(woosuk): This may not be true for all models.
         return self.hf_config.hidden_size // self.hf_config.num_attention_heads

-    def get_num_heads(self, parallel_config: "ParallelConfig") -> int:
+    def get_total_num_kv_heads(self) -> int:
+        """Returns the total number of KV heads."""
         # For GPTBigCode & Falcon:
-        # Note: for falcon, when new_decoder_architecture is True, the
+        # NOTE: for falcon, when new_decoder_architecture is True, the
         # multi_query flag is ignored and we use n_head_kv for the number of
         # KV heads.
+        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
         new_decoder_arch_falcon = (
-            self.hf_config.model_type == "falcon"
+            self.hf_config.model_type in falcon_model_types
             and getattr(self.hf_config, "new_decoder_architecture", False))
         if not new_decoder_arch_falcon and getattr(self.hf_config,
                                                    "multi_query", False):
             # Multi-query attention, only one KV head.
+            # Currently, tensor parallelism is not supported in this case.
             return 1
-        # For Falcon:
-        if getattr(self.hf_config, "n_head_kv", None) is not None:
-            return (self.hf_config.n_head_kv //
-                    parallel_config.tensor_parallel_size)
-        # For LLaMA-2:
-        if getattr(self.hf_config, "num_key_value_heads", None) is not None:
-            return (self.hf_config.num_key_value_heads //
-                    parallel_config.tensor_parallel_size)
-        total_num_attention_heads = self.hf_config.num_attention_heads
-        return total_num_attention_heads // parallel_config.tensor_parallel_size
-
-    def get_max_model_len(self) -> int:
-        max_model_len = float("inf")
-        possible_keys = [
-            # OPT
-            "max_position_embeddings",
-            # GPT-2
-            "n_positions",
-            # MPT
-            "max_seq_len",
-            # Others
-            "max_sequence_length",
-            "max_seq_length",
-            "seq_len",
-        ]
-        for key in possible_keys:
-            max_len_key = getattr(self.hf_config, key, None)
-            if max_len_key is not None:
-                max_model_len = min(max_model_len, max_len_key)
-        return max_model_len
+
+        attributes = [
+            # For Falcon:
+            "n_head_kv",
+            "num_kv_heads",
+            # For LLaMA-2:
+            "num_key_value_heads",
+            # For ChatGLM:
+            "multi_query_group_num",
+        ]
+        for attr in attributes:
+            num_kv_heads = getattr(self.hf_config, attr, None)
+            if num_kv_heads is not None:
+                return num_kv_heads
+
+        # For non-grouped-query attention models, the number of KV heads is
+        # equal to the number of attention heads.
+        return self.hf_config.num_attention_heads
+
+    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
+        """Returns the number of KV heads per GPU."""
+        total_num_kv_heads = self.get_total_num_kv_heads()
+        # If tensor parallelism is used, we divide the number of KV heads by
+        # the tensor parallel size. We will replicate the KV heads in the
+        # case where the number of KV heads is smaller than the tensor
+        # parallel size so each GPU has at least one KV head.
+        return max(1,
+                   total_num_kv_heads // parallel_config.tensor_parallel_size)

     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_config.num_hidden_layers
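
Side note, not part of the diff: the per-GPU KV-head math in get_num_kv_heads, as a small sketch. Divide by tensor-parallel size, but never drop below one head per GPU (heads are replicated instead):

    def num_kv_heads_per_gpu(total_num_kv_heads: int, tp_size: int) -> int:
        return max(1, total_num_kv_heads // tp_size)

    assert num_kv_heads_per_gpu(8, 2) == 4  # grouped-query: heads are split
    assert num_kv_heads_per_gpu(1, 4) == 1  # multi-query: head is replicated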
@@ -172,10 +238,12 @@ class CacheConfig:
         block_size: int,
         gpu_memory_utilization: float,
         swap_space: int,
+        sliding_window: Optional[int] = None,
     ) -> None:
         self.block_size = block_size
         self.gpu_memory_utilization = gpu_memory_utilization
         self.swap_space_bytes = swap_space * _GB
+        self.sliding_window = sliding_window
         self._verify_args()

         # Will be set after profiling.
@@ -223,10 +291,12 @@ class ParallelConfig:
         pipeline_parallel_size: int,
         tensor_parallel_size: int,
         worker_use_ray: bool,
+        max_parallel_loading_workers: Optional[int] = None,
     ) -> None:
         self.pipeline_parallel_size = pipeline_parallel_size
         self.tensor_parallel_size = tensor_parallel_size
         self.worker_use_ray = worker_use_ray
+        self.max_parallel_loading_workers = max_parallel_loading_workers

         self.world_size = pipeline_parallel_size * tensor_parallel_size
         if self.world_size > 1:
@@ -249,13 +319,41 @@ class SchedulerConfig:
         iteration.
     max_model_len: Maximum length of a sequence (including prompt
         and generated text).
+    max_paddings: Maximum number of paddings to be added to a batch.
     """

-    def __init__(self, max_num_batched_tokens: int, max_num_seqs: int,
-                 max_model_len: int) -> None:
-        self.max_num_batched_tokens = max_num_batched_tokens
+    def __init__(
+        self,
+        max_num_batched_tokens: Optional[int],
+        max_num_seqs: int,
+        max_model_len: int,
+        max_paddings: int,
+    ) -> None:
+        if max_num_batched_tokens is not None:
+            self.max_num_batched_tokens = max_num_batched_tokens
+        else:
+            # If max_model_len is too short, use 2048 as the default value for
+            # higher throughput.
+            self.max_num_batched_tokens = max(max_model_len, 2048)
         self.max_num_seqs = max_num_seqs
         self.max_model_len = max_model_len
+        self.max_paddings = max_paddings
+        self._verify_args()
+
+    def _verify_args(self) -> None:
+        if self.max_num_batched_tokens < self.max_model_len:
+            raise ValueError(
+                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
+                f"smaller than max_model_len ({self.max_model_len}). "
+                "This effectively limits the maximum sequence length to "
+                "max_num_batched_tokens and makes vLLM reject longer "
+                "sequences. Please increase max_num_batched_tokens or "
+                "decrease max_model_len.")
+        if self.max_num_batched_tokens < self.max_num_seqs:
+            raise ValueError(
+                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
+                "be greater than or equal to max_num_seqs "
+                f"({self.max_num_seqs}).")


 _STR_DTYPE_TO_TORCH_DTYPE = {
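
Side note, not part of the diff: the new SchedulerConfig default, restated. When max_num_batched_tokens is not given, it falls back to max(max_model_len, 2048):

    def default_max_num_batched_tokens(max_num_batched_tokens, max_model_len):
        if max_num_batched_tokens is not None:
            return max_num_batched_tokens
        return max(max_model_len, 2048)

    assert default_max_num_batched_tokens(None, 512) == 2048
    assert default_max_num_batched_tokens(None, 4096) == 4096
    assert default_max_num_batched_tokens(8192, 4096) == 8192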
@@ -269,7 +367,7 @@ _STR_DTYPE_TO_TORCH_DTYPE = {

 def _get_and_verify_dtype(
     config: PretrainedConfig,
-    dtype: str,
+    dtype: Union[str, torch.dtype],
 ) -> torch.dtype:
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
@@ -277,17 +375,23 @@ def _get_and_verify_dtype(
     if config_dtype is None:
         config_dtype = torch.float32

-    dtype = dtype.lower()
-    if dtype == "auto":
-        if config_dtype == torch.float32:
-            # Following the common practice, we use float16 for float32 models.
-            torch_dtype = torch.float16
-        else:
-            torch_dtype = config_dtype
-    else:
-        if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
-            raise ValueError(f"Unknown dtype: {dtype}")
-        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+    if isinstance(dtype, str):
+        dtype = dtype.lower()
+        if dtype == "auto":
+            if config_dtype == torch.float32:
+                # Following the common practice, we use float16 for float32
+                # models.
+                torch_dtype = torch.float16
+            else:
+                torch_dtype = config_dtype
+        else:
+            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
+                raise ValueError(f"Unknown dtype: {dtype}")
+            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+    elif isinstance(dtype, torch.dtype):
+        torch_dtype = dtype
+    else:
+        raise ValueError(f"Unknown dtype: {dtype}")

     # Verify the dtype.
     if torch_dtype != config_dtype:
@@ -301,13 +405,62 @@ def _get_and_verify_dtype(
         # Casting between float16 and bfloat16 is allowed with a warning.
         logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

-    # Check if the GPU supports the dtype.
-    if torch_dtype == torch.bfloat16:
-        compute_capability = torch.cuda.get_device_capability()
-        if compute_capability[0] < 8:
-            gpu_name = torch.cuda.get_device_name()
-            raise ValueError(
-                "Bfloat16 is only supported on GPUs with compute capability "
-                f"of at least 8.0. Your {gpu_name} GPU has compute capability "
-                f"{compute_capability[0]}.{compute_capability[1]}.")
     return torch_dtype
+
+
+def _get_and_verify_max_len(
+    hf_config: PretrainedConfig,
+    max_model_len: Optional[int],
+) -> int:
+    """Get and verify the model's maximum length."""
+    derived_max_model_len = float("inf")
+    possible_keys = [
+        # OPT
+        "max_position_embeddings",
+        # GPT-2
+        "n_positions",
+        # MPT
+        "max_seq_len",
+        # ChatGLM2
+        "seq_length",
+        # Others
+        "max_sequence_length",
+        "max_seq_length",
+        "seq_len",
+    ]
+    for key in possible_keys:
+        max_len_key = getattr(hf_config, key, None)
+        if max_len_key is not None:
+            derived_max_model_len = min(derived_max_model_len, max_len_key)
+    if derived_max_model_len == float("inf"):
+        if max_model_len is not None:
+            # If max_model_len is specified, we use it.
+            return max_model_len
+
+        default_max_len = 2048
+        logger.warning(
+            "The model's config.json does not contain any of the following "
+            "keys to determine the original maximum length of the model: "
+            f"{possible_keys}. Assuming the model's maximum length is "
+            f"{default_max_len}.")
+        derived_max_model_len = default_max_len
+
+    rope_scaling = getattr(hf_config, "rope_scaling", None)
+    if rope_scaling is not None:
+        assert "factor" in rope_scaling
+        scaling_factor = rope_scaling["factor"]
+        if rope_scaling["type"] == "yarn":
+            derived_max_model_len = rope_scaling[
+                "original_max_position_embeddings"]
+        derived_max_model_len *= scaling_factor
+
+    if max_model_len is None:
+        max_model_len = derived_max_model_len
+    elif max_model_len > derived_max_model_len:
+        raise ValueError(
+            f"User-specified max_model_len ({max_model_len}) is greater than "
+            f"the derived max_model_len ({max_len_key}={derived_max_model_len}"
+            " in model's config.json). This may lead to incorrect model "
+            "outputs or CUDA errors. Make sure the value is correct and "
+            "within the model context size.")
+    return int(max_model_len)
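
Side note, not part of the diff: the rope-scaling adjustment in _get_and_verify_max_len, as a hedged sketch. Scaling types multiply the derived length by `factor`; "yarn" first resets the base length to original_max_position_embeddings:

    def apply_rope_scaling(derived_len, rope_scaling):
        if rope_scaling is None:
            return derived_len
        factor = rope_scaling["factor"]
        if rope_scaling["type"] == "yarn":
            derived_len = rope_scaling["original_max_position_embeddings"]
        return int(derived_len * factor)

    assert apply_rope_scaling(4096, None) == 4096
    assert apply_rope_scaling(4096, {"type": "linear", "factor": 2.0}) == 8192
    assert apply_rope_scaling(
        4096, {"type": "yarn", "factor": 4.0,
               "original_max_position_embeddings": 2048}) == 8192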
@@ -1,10 +1,14 @@ vllm/core/block_manager.py
 """A block manager that manages token blocks."""
+import enum
 from typing import Dict, List, Optional, Set, Tuple

 from vllm.block import PhysicalTokenBlock
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device

+# Mapping: logical block number -> physical block.
+BlockTable = List[PhysicalTokenBlock]
+
+
 class BlockAllocator:
     """Manages free physical token blocks for a device.
@@ -25,7 +29,7 @@ class BlockAllocator:
         self.num_blocks = num_blocks

         # Initialize the free blocks.
-        self.free_blocks: List[PhysicalTokenBlock] = []
+        self.free_blocks: BlockTable = []
         for i in range(num_blocks):
             block = PhysicalTokenBlock(device=device,
                                        block_number=i,
@@ -50,8 +54,18 @@ class BlockAllocator:
         return len(self.free_blocks)


-# Mapping: logical block number -> physical block.
-BlockTable = List[PhysicalTokenBlock]
+class AllocStatus(enum.Enum):
+    """Result for BlockSpaceManager.can_allocate.
+
+    1. OK: the seq_group can be allocated now.
+    2. LATER: the seq_group cannot be allocated now, but the allocator's
+       total capacity is large enough, so it can be allocated later.
+    3. NEVER: the seq_group can never be allocated; it is too large to
+       fit in GPU memory.
+    """
+    OK = enum.auto()
+    LATER = enum.auto()
+    NEVER = enum.auto()


 class BlockSpaceManager:
@@ -63,10 +77,18 @@ class BlockSpaceManager:
         num_gpu_blocks: int,
         num_cpu_blocks: int,
         watermark: float = 0.01,
+        sliding_window: Optional[int] = None,
     ) -> None:
         self.block_size = block_size
         self.num_total_gpu_blocks = num_gpu_blocks
         self.num_total_cpu_blocks = num_cpu_blocks

+        self.block_sliding_window = None
+        if sliding_window is not None:
+            assert sliding_window % block_size == 0, (sliding_window,
+                                                      block_size)
+            self.block_sliding_window = sliding_window // block_size
+
         self.watermark = watermark
         assert watermark >= 0.0
@@ -78,15 +100,24 @@ class BlockSpaceManager:
         # Mapping: seq_id -> BlockTable.
         self.block_tables: Dict[int, BlockTable] = {}

-    def can_allocate(self, seq_group: SequenceGroup) -> bool:
+    def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         # FIXME(woosuk): Here we assume that all sequences in the group share
         # the same prompt. This may not be true for preempted sequences.
         seq = seq_group.get_seqs()[0]
         num_required_blocks = len(seq.logical_token_blocks)
+        if self.block_sliding_window is not None:
+            num_required_blocks = min(num_required_blocks,
+                                      self.block_sliding_window)
         num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()

         # Use watermark to avoid frequent cache eviction.
-        return (num_free_gpu_blocks - num_required_blocks >=
-                self.watermark_blocks)
+        if (self.num_total_gpu_blocks - num_required_blocks <
+                self.watermark_blocks):
+            return AllocStatus.NEVER
+        if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER

     def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
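
Side note, not part of the diff: the three-way watermark decision in can_allocate, as a self-contained sketch with illustrative numbers:

    import enum

    class AllocStatus(enum.Enum):
        OK = enum.auto()
        LATER = enum.auto()
        NEVER = enum.auto()

    def can_allocate(num_total, num_free, num_required, watermark_blocks):
        if num_total - num_required < watermark_blocks:
            return AllocStatus.NEVER  # cannot fit even on an empty GPU
        if num_free - num_required >= watermark_blocks:
            return AllocStatus.OK     # fits right now
        return AllocStatus.LATER      # fits once running requests free blocks

    assert can_allocate(100, 80, 10, 5) is AllocStatus.OK
    assert can_allocate(100, 12, 10, 5) is AllocStatus.LATER
    assert can_allocate(100, 80, 99, 5) is AllocStatus.NEVER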
@@ -95,8 +126,12 @@ class BlockSpaceManager:

         # Allocate new physical token blocks that will store the prompt tokens.
         block_table: BlockTable = []
-        for _ in range(len(seq.logical_token_blocks)):
-            block = self.gpu_allocator.allocate()
+        for logical_idx in range(len(seq.logical_token_blocks)):
+            if (self.block_sliding_window is not None
+                    and logical_idx >= self.block_sliding_window):
+                block = block_table[logical_idx % self.block_sliding_window]
+            else:
+                block = self.gpu_allocator.allocate()
             # Set the reference counts of the token blocks.
             block.ref_count = seq_group.num_seqs()
             block_table.append(block)
@@ -118,11 +153,17 @@ class BlockSpaceManager:
         block_table = self.block_tables[seq.seq_id]

         if len(block_table) < len(logical_blocks):
-            # The sequence has a new logical block.
-            # Allocate a new physical block.
-            block = self.gpu_allocator.allocate()
-            block_table.append(block)
-            return None
+            if (self.block_sliding_window
+                    and len(block_table) >= self.block_sliding_window):
+                # re-use a block
+                block_table.append(block_table[len(block_table) %
+                                               self.block_sliding_window])
+            else:
+                # The sequence has a new logical block.
+                # Allocate a new physical block.
+                block = self.gpu_allocator.allocate()
+                block_table.append(block)
+            return None

         # We want to append the token to the last physical block.
         last_block = block_table[-1]
@@ -154,9 +195,7 @@ class BlockSpaceManager:
         for seq in seq_group.get_seqs():
             if seq.is_finished():
                 continue
-            block_table = self.block_tables[seq.seq_id]
-            for block in block_table:
-                blocks.add(block)
+            blocks.update(self.block_tables[seq.seq_id])
         return list(blocks)

     def can_swap_in(self, seq_group: SequenceGroup) -> bool:
@@ -224,7 +263,7 @@ class BlockSpaceManager:
         return block_number_mapping

     def _free_block_table(self, block_table: BlockTable) -> None:
-        for block in block_table:
+        for block in set(block_table):
             if block.device == Device.GPU:
                 self.gpu_allocator.free(block)
             else:
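
Side note, not part of the diff: the sliding-window reuse rule above, sketched. Once a sequence holds block_sliding_window blocks, logical block i maps back onto slot i % block_sliding_window instead of allocating a new block:

    block_sliding_window = 4
    block_table = []
    for logical_idx in range(10):
        if logical_idx >= block_sliding_window:
            # Re-use an existing block instead of allocating a new one.
            block_table.append(block_table[logical_idx % block_sliding_window])
        else:
            block_table.append(f"block-{logical_idx}")  # stand-in for allocate()

    assert len(set(block_table)) == block_sliding_window

This is also why _free_block_table now iterates over set(block_table): with reuse, the same physical block can appear several times in one table and must be freed only once.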
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -3,7 +3,7 @@ import time
 from typing import Dict, Iterable, List, Optional, Tuple, Union

 from vllm.config import CacheConfig, SchedulerConfig
-from vllm.core.block_manager import BlockSpaceManager
+from vllm.core.block_manager import AllocStatus, BlockSpaceManager
 from vllm.core.policy import PolicyFactory
 from vllm.logger import init_logger
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
@@ -73,7 +73,7 @@ class Scheduler:
            block_size=self.cache_config.block_size,
            num_gpu_blocks=self.cache_config.num_gpu_blocks,
            num_cpu_blocks=self.cache_config.num_cpu_blocks,
-        )
+            sliding_window=self.cache_config.sliding_window)

        # TODO(zhuohan): Use deque instead of list for better performance.
        # Sequence groups in the WAITING state.
@@ -121,7 +121,7 @@ class Scheduler:
        blocks_to_copy: Dict[int, List[int]] = {}

        # Fix the current time.
-        now = time.time()
+        now = time.monotonic()

        # Join waiting sequences if possible.
        if not self.swapped:
@@ -131,7 +131,8 @@ class Scheduler:
            # requests in the generation phase.
            num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
                                for seq_group in self.running)
-            num_batched_tokens = 0
+            seq_lens: List[int] = []
+
            # Optimization: We do not sort the waiting queue since the preempted
            # sequence groups are added to the front and the new sequence groups
            # are added to the back.
@@ -153,11 +154,23 @@ class Scheduler:
                continue

                # If the sequence group cannot be allocated, stop.
-                if not self.block_manager.can_allocate(seq_group):
+                can_allocate = self.block_manager.can_allocate(seq_group)
+                if can_allocate == AllocStatus.LATER:
                    break
+                elif can_allocate == AllocStatus.NEVER:
+                    logger.warning(
+                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                        f" and exceeds the capacity of block_manager")
+                    for seq in seq_group.get_seqs():
+                        seq.status = SequenceStatus.FINISHED_IGNORED
+                    ignored_seq_groups.append(seq_group)
+                    self.waiting.pop(0)
+                    continue

                # If the number of batched tokens exceeds the limit, stop.
-                if (num_batched_tokens + num_prompt_tokens >
+                new_seq_lens = seq_lens + [num_prompt_tokens]
+                num_batched_tokens = len(new_seq_lens) * max(new_seq_lens)
+                if (num_batched_tokens >
                        self.scheduler_config.max_num_batched_tokens):
                    break
@@ -168,18 +181,23 @@ class Scheduler:
                        self.scheduler_config.max_num_seqs):
                    break

+                num_paddings = num_batched_tokens - sum(new_seq_lens)
+                if num_paddings > self.scheduler_config.max_paddings:
+                    break
+                seq_lens = new_seq_lens
+
                seq_group = self.waiting.pop(0)
                self._allocate(seq_group)
                self.running.append(seq_group)
-                num_batched_tokens += num_prompt_tokens
                num_curr_seqs += num_new_seqs
                scheduled.append(seq_group)

-            if scheduled:
+            if scheduled or ignored_seq_groups:
                scheduler_outputs = SchedulerOutputs(
                    scheduled_seq_groups=scheduled,
                    prompt_run=True,
-                    num_batched_tokens=num_batched_tokens,
+                    num_batched_tokens=len(seq_lens) *
+                    max(seq_lens) if seq_lens else 0,
                    blocks_to_swap_in=blocks_to_swap_in,
                    blocks_to_swap_out=blocks_to_swap_out,
                    blocks_to_copy=blocks_to_copy,
@@ -268,7 +286,7 @@ class Scheduler:
        # Create input data structures.
        seq_group_metadata_list: List[SequenceGroupMetadata] = []
        for seq_group in scheduler_outputs.scheduled_seq_groups:
-            seq_data: Dict[int, List[SequenceData]] = {}
+            seq_data: Dict[int, SequenceData] = {}
            block_tables: Dict[int, List[int]] = {}
            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                seq_id = seq.seq_id
@@ -343,7 +361,7 @@ class Scheduler:
        elif preemption_mode == PreemptionMode.SWAP:
            self._preempt_by_swap(seq_group, blocks_to_swap_out)
        else:
-            assert False, "Invalid preemption mode."
+            raise AssertionError("Invalid preemption mode.")

    def _preempt_by_recompute(
        self,
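The scheduler now prices a prompt batch at its padded size, `len(new_seq_lens) * max(new_seq_lens)`, and separately rejects batches that would spend more than `max_paddings` slots on padding. A self-contained sketch of that admission test (the limits here are illustrative constants, not configuration pulled from vLLM):

```python
# Standalone sketch of the padding-aware admission test introduced above.
# The limits are illustrative; in vLLM they come from SchedulerConfig.
from typing import List

MAX_NUM_BATCHED_TOKENS = 2560
MAX_PADDINGS = 256


def can_admit(seq_lens: List[int], num_prompt_tokens: int) -> bool:
    """Return True if a prompt of num_prompt_tokens fits in the batch.

    Prompts are right-padded to the longest prompt in the batch, so the
    real cost of the batch is len(batch) * max(batch), not sum(batch).
    """
    new_seq_lens = seq_lens + [num_prompt_tokens]
    num_batched_tokens = len(new_seq_lens) * max(new_seq_lens)
    if num_batched_tokens > MAX_NUM_BATCHED_TOKENS:
        return False  # padded batch would exceed the token budget
    num_paddings = num_batched_tokens - sum(new_seq_lens)
    return num_paddings <= MAX_PADDINGS  # reject mostly-padding batches


if __name__ == "__main__":
    batch: List[int] = []
    for prompt_len in [512, 480, 500, 16]:
        if can_admit(batch, prompt_len):
            batch.append(prompt_len)
    # The 16-token prompt is rejected: padding it to 512 wastes 496 slots.
    print(batch)  # [512, 480, 500]
```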
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -18,25 +18,34 @@ class EngineArgs:
    load_format: str = 'auto'
    dtype: str = 'auto'
    seed: int = 0
+    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
+    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
-    max_num_batched_tokens: int = 2560
+    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
+    max_paddings: int = 256
    disable_log_stats: bool = False
+    revision: Optional[str] = None
+    tokenizer_revision: Optional[str] = None
+    quantization: Optional[str] = None

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model
-        self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens)

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""

+        # NOTE: If you update any of the arguments below, please also
+        # make sure to update docs/source/models/engine_args.rst
+
        # Model arguments
        parser.add_argument(
            '--model',
@@ -48,6 +57,20 @@ class EngineArgs:
            type=str,
            default=EngineArgs.tokenizer,
            help='name or path of the huggingface tokenizer to use')
+        parser.add_argument(
+            '--revision',
+            type=str,
+            default=None,
+            help='the specific model version to use. It can be a branch '
+            'name, a tag name, or a commit id. If unspecified, will use '
+            'the default version.')
+        parser.add_argument(
+            '--tokenizer-revision',
+            type=str,
+            default=None,
+            help='the specific tokenizer version to use. It can be a branch '
+            'name, a tag name, or a commit id. If unspecified, will use '
+            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
@@ -79,16 +102,22 @@ class EngineArgs:
            'a numpy cache to speed up the loading. '
            '"dummy" will initialize the weights with random values, '
            'which is mainly for profiling.')
-        # TODO(woosuk): Support FP32.
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
-            choices=['auto', 'half', 'bfloat16', 'float'],
+            choices=[
+                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
+            ],
            help='data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
+        parser.add_argument('--max-model-len',
+                            type=int,
+                            default=None,
+                            help='model context length. If unspecified, '
+                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
@@ -104,6 +133,12 @@ class EngineArgs:
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
+        parser.add_argument(
+            '--max-parallel-loading-workers',
+            type=int,
+            help='load model sequentially in multiple batches, '
+            'to avoid RAM OOM when using tensor '
+            'parallel and large models')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
@@ -133,9 +168,20 @@ class EngineArgs:
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
+        parser.add_argument('--max-paddings',
+                            type=int,
+                            default=EngineArgs.max_paddings,
+                            help='maximum number of paddings in a batch')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
+        # Quantization settings.
+        parser.add_argument('--quantization',
+                            '-q',
+                            type=str,
+                            choices=['awq', 'squeezellm', None],
+                            default=None,
+                            help='Method used to quantize the weights')
        return parser

    @classmethod
@@ -149,20 +195,24 @@ class EngineArgs:
    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
-        # Initialize the configs.
        model_config = ModelConfig(self.model, self.tokenizer,
                                   self.tokenizer_mode, self.trust_remote_code,
                                   self.download_dir, self.load_format,
-                                   self.dtype, self.seed)
+                                   self.dtype, self.seed, self.revision,
+                                   self.tokenizer_revision, self.max_model_len,
+                                   self.quantization)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
-                                   self.swap_space)
+                                   self.swap_space,
+                                   model_config.get_sliding_window())
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
-                                         self.worker_use_ray)
+                                         self.worker_use_ray,
+                                         self.max_parallel_loading_workers)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
-                                           model_config.get_max_model_len())
+                                           model_config.max_model_len,
+                                           self.max_paddings)
        return model_config, cache_config, parallel_config, scheduler_config


@@ -171,6 +221,7 @@ class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
+    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(
@@ -183,4 +234,10 @@ class AsyncEngineArgs(EngineArgs):
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
+        parser.add_argument('--max-log-len',
+                            type=int,
+                            default=None,
+                            help='max number of prompt characters or prompt '
+                            'ID numbers being printed in log. '
+                            'Default: unlimited.')
        return parser
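With the new dataclass fields, the same configuration surface is available programmatically as well as through the CLI. A sketch of driving it from Python, assuming a vLLM build matching this diff (the model name and values are illustrative):

```python
# Sketch: building engine configs programmatically via EngineArgs, which the
# hunks above extend with revision / max_model_len / max_paddings /
# quantization. Assumes a vLLM build matching this diff.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",  # illustrative model
    max_model_len=2048,         # new: cap the context length explicitly
    max_paddings=256,           # new: cap wasted padding per prompt batch
    revision=None,              # new: pin an HF branch/tag/commit if desired
)
# create_engine_configs() returns the four config objects, per the diff above.
model_config, cache_config, parallel_config, scheduler_config = (
    engine_args.create_engine_configs())
print(scheduler_config.max_num_seqs)
```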
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1,7 +1,8 @@
 import asyncio
 import time
 from functools import partial
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union
+from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type,
+                    Union)

 from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -78,14 +79,24 @@ class RequestTracker:
        self._finished_requests: asyncio.Queue[str] = asyncio.Queue()
        self._new_requests: asyncio.Queue[Tuple[AsyncStream,
                                                dict]] = asyncio.Queue()
+        self.new_requests_event = None

    def __contains__(self, item):
        return item in self._request_streams

-    def propagate_exception(self, exc: Exception) -> None:
-        """Propagate an exception to all request streams."""
-        for stream in self._request_streams.values():
-            stream.put(exc)
+    def init_event(self):
+        self.new_requests_event = asyncio.Event()
+
+    def propagate_exception(self,
+                            exc: Exception,
+                            request_id: Optional[str] = None) -> None:
+        """Propagate an exception to request streams
+        (all if request_id is None)."""
+        if request_id is not None:
+            self._request_streams[request_id].put(exc)
+        else:
+            for stream in self._request_streams.values():
+                stream.put(exc)

    def process_request_output(self,
                               request_output: RequestOutput,
@@ -112,6 +123,9 @@ class RequestTracker:
                "request_id": request_id,
                **engine_add_request_kwargs
            }))
+
+        self.new_requests_event.set()
+
        return stream

    def abort_request(self, request_id: str, *, verbose: bool = False) -> None:
@@ -128,10 +142,10 @@ class RequestTracker:

        self._request_streams[request_id].finish()

-    def get_new_and_finished_requests(self) -> Tuple[List[dict], Set[str]]:
+    def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]:
        """Get the new requests and finished requests to be
        sent to the engine."""
-        new_requests: List[dict] = []
+        new_requests: List[Dict] = []
        finished_requests: Set[str] = set()

        while not self._finished_requests.empty():
@@ -148,8 +162,13 @@ class RequestTracker:
            self._request_streams[stream.request_id] = stream
            new_requests.append(new_request)

+        self.new_requests_event.clear()
+
        return new_requests, finished_requests

+    async def wait_for_new_requests(self):
+        await self.new_requests_event.wait()
+

class _AsyncLLMEngine(LLMEngine):
    """Extension of LLMEngine to add async methods."""
@@ -164,10 +183,9 @@ class _AsyncLLMEngine(LLMEngine):
        and updates the scheduler with the model outputs. Finally, it decodes
        the sequences and returns the newly generated results.
        """
-        (seq_group_metadata_list, scheduler_outputs,
-         early_return) = self._schedule()
-        if early_return is not None:
-            return early_return
+        seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
+        if scheduler_outputs.is_empty():
+            return ignored

        # Execute the model.
        output = await self._run_workers_async(
@@ -178,7 +196,7 @@ class _AsyncLLMEngine(LLMEngine):
            blocks_to_copy=scheduler_outputs.blocks_to_copy,
        )

-        return self._process_model_outputs(output, scheduler_outputs)
+        return self._process_model_outputs(output, scheduler_outputs) + ignored

    async def _run_workers_async(
        self,
@@ -188,18 +206,17 @@ class _AsyncLLMEngine(LLMEngine):
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers."""
-        all_outputs = []
+        coros = []
        for worker in self.workers:
            if self.parallel_config.worker_use_ray:
-                executor = partial(worker.execute_method.remote, method)
+                coros.append(
+                    worker.execute_method.remote(method, *args, **kwargs))
            else:
                executor = getattr(worker, method)
+                coros.append(asyncio.get_event_loop().run_in_executor(
+                    None, partial(executor, *args, **kwargs)))

-            output = executor(*args, **kwargs)
-            all_outputs.append(output)
-
-        if self.parallel_config.worker_use_ray:
-            all_outputs = await asyncio.gather(*all_outputs)
+        all_outputs = await asyncio.gather(*coros)

        if get_all_outputs:
            return all_outputs
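The `_run_workers_async` rewrite above turns the per-worker calls into awaitables and gathers them, so Ray actor calls and local executor threads run concurrently instead of serially. A generic sketch of that fan-out pattern, with plain callables standing in for workers:

```python
# Generic sketch of the fan-out pattern _run_workers_async adopts above:
# build one awaitable per worker, then await them all with asyncio.gather.
# The "workers" here are plain callables, standing in for Ray actors.
import asyncio
from functools import partial
from typing import Any, Callable, List


def make_worker(rank: int) -> Callable[[str], str]:
    def execute(method: str) -> str:
        return f"worker{rank}:{method}"
    return execute


async def run_workers_async(workers: List[Callable[[str], str]],
                            method: str) -> List[Any]:
    loop = asyncio.get_event_loop()
    # Blocking calls are pushed onto the default executor so they run
    # concurrently instead of one after another.
    coros = [
        loop.run_in_executor(None, partial(worker, method))
        for worker in workers
    ]
    return await asyncio.gather(*coros)


if __name__ == "__main__":
    workers = [make_worker(i) for i in range(4)]
    outputs = asyncio.run(run_workers_async(workers, "execute_model"))
    print(outputs)
```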
@@ -230,6 +247,8 @@ class AsyncLLMEngine:
            async frontend will be executed in a separate process as the
            model workers.
        log_requests: Whether to log the requests.
+        start_engine_loop: If True, the background task to run the engine
+            will be automatically started in the generate call.
        *args, *kwargs: Arguments for LLMEngine.
    """

@@ -240,17 +259,22 @@ class AsyncLLMEngine:
                 engine_use_ray: bool,
                 *args,
                 log_requests: bool = True,
-                 start_engine_loop: bool = False,
+                 max_log_len: Optional[int] = None,
+                 start_engine_loop: bool = True,
                 **kwargs) -> None:
        self.worker_use_ray = worker_use_ray
        self.engine_use_ray = engine_use_ray
        self.log_requests = log_requests
+        self.max_log_len = max_log_len
        self.engine = self._init_engine(*args, **kwargs)

-        self.request_tracker: RequestTracker = RequestTracker()
        self.background_loop = None
-        if start_engine_loop:
-            self.start_background_loop()
+        # We need to keep a reference to unshielded
+        # task as well to prevent it from being garbage
+        # collected
+        self._background_loop_unshielded = None
+        self.start_engine_loop = start_engine_loop
+        self._request_tracker = RequestTracker()

    @property
    def is_running(self) -> bool:
@@ -261,11 +285,14 @@ class AsyncLLMEngine:
        """Start the background loop."""
        if self.is_running:
            raise RuntimeError("Background loop is already running.")
-        self.background_loop = asyncio.get_event_loop().create_task(
-            self.run_engine_loop())
-        self.background_loop.add_done_callback(
+        self._request_tracker.init_event()
+
+        self._background_loop_unshielded = asyncio.get_event_loop(
+        ).create_task(self.run_engine_loop())
+        self._background_loop_unshielded.add_done_callback(
            partial(_raise_exception_on_finish,
-                    request_tracker=self.request_tracker))
+                    request_tracker=self._request_tracker))
+        self.background_loop = asyncio.shield(self._background_loop_unshielded)

    def _init_engine(self, *args,
                     **kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]:
@@ -274,14 +301,25 @@ class AsyncLLMEngine:
        elif self.worker_use_ray:
            engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
        else:
-            engine_class = ray.remote(num_gpus=1)(self._engine_class).remote
+            # FIXME(woosuk): This is a bit hacky. Be careful when changing the
+            # order of the arguments.
+            cache_config = args[1]
+            parallel_config = args[2]
+            if parallel_config.tensor_parallel_size == 1:
+                num_gpus = cache_config.gpu_memory_utilization
+            else:
+                num_gpus = 1
+            engine_class = ray.remote(num_gpus=num_gpus)(
+                self._engine_class).remote
        return engine_class(*args, **kwargs)

-    async def engine_step(self):
-        """Kick the engine to process the waiting requests."""
+    async def engine_step(self) -> bool:
+        """Kick the engine to process the waiting requests.
+
+        Returns True if there are in-progress requests."""
+
        new_requests, finished_requests = (
-            self.request_tracker.get_new_and_finished_requests())
+            self._request_tracker.get_new_and_finished_requests())

        for new_request in new_requests:
            # Add the request into the vLLM engine's waiting queue.
@@ -301,9 +339,11 @@ class AsyncLLMEngine:

        # Put the outputs into the corresponding streams.
        for request_output in request_outputs:
-            self.request_tracker.process_request_output(
+            self._request_tracker.process_request_output(
                request_output, verbose=self.log_requests)

+        return len(request_outputs) > 0
+
    async def _engine_abort(self, request_ids: Iterable[str]):
        if self.engine_use_ray:
            await self.engine.abort_request.remote(request_ids)
@@ -311,8 +351,12 @@ class AsyncLLMEngine:
            self.engine.abort_request(request_ids)

    async def run_engine_loop(self):
+        # Initialize the RequestTracker here so it uses the right event loop.
+        has_requests_in_progress = False
        while True:
-            await self.engine_step()
+            if not has_requests_in_progress:
+                await self._request_tracker.wait_for_new_requests()
+            has_requests_in_progress = await self.engine_step()
            await asyncio.sleep(0)

    async def add_request(
@@ -324,19 +368,30 @@ class AsyncLLMEngine:
        arrival_time: Optional[float] = None,
    ) -> AsyncStream:
        if self.log_requests:
+            shortened_prompt = prompt
+            shortened_token_ids = prompt_token_ids
+            if self.max_log_len is not None:
+                if shortened_prompt is not None:
+                    shortened_prompt = shortened_prompt[:self.max_log_len]
+                if shortened_token_ids is not None:
+                    shortened_token_ids = shortened_token_ids[:self.
+                                                              max_log_len]
            logger.info(f"Received request {request_id}: "
-                        f"prompt: {prompt!r}, "
+                        f"prompt: {shortened_prompt!r}, "
                        f"sampling params: {sampling_params}, "
-                        f"prompt token ids: {prompt_token_ids}.")
+                        f"prompt token ids: {shortened_token_ids}.")

        if not self.is_running:
-            raise AsyncEngineDeadError(
-                "Background loop is not running. If it was running, "
-                "inspect the output to find the stacktrace of the "
-                "error that caused the background loop to stop "
-                "(AsyncEngineDeadError).")
+            if self.start_engine_loop:
+                self.start_background_loop()
+            else:
+                raise AsyncEngineDeadError(
+                    "Background loop is not running. If it was running, "
+                    "inspect the output to find the stacktrace of the "
+                    "error that caused the background loop to stop "
+                    "(AsyncEngineDeadError).")

-        stream = self.request_tracker.add_request(
+        stream = self._request_tracker.add_request(
            request_id,
            prompt=prompt,
            sampling_params=sampling_params,
@@ -370,7 +425,8 @@ class AsyncLLMEngine:
            request.
        """
        # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()

        try:
            stream = await self.add_request(request_id,
@@ -381,8 +437,9 @@ class AsyncLLMEngine:

            async for request_output in stream:
                yield request_output
-        except Exception as e:
-            # If there is an exception, abort the request.
+        except (Exception, asyncio.CancelledError) as e:
+            # If there is an exception or coroutine is cancelled, abort the
+            # request.
            self._abort(request_id)
            raise e

@@ -413,8 +470,8 @@ class AsyncLLMEngine:
        Args:
            request_id: The unique id of the request.
        """
-        self.request_tracker.abort_request(request_id,
-                                           verbose=self.log_requests)
+        self._request_tracker.abort_request(request_id,
+                                            verbose=self.log_requests)

    async def get_model_config(self) -> ModelConfig:
        """Get the model configuration of the vLLM engine."""
@@ -426,7 +483,7 @@ class AsyncLLMEngine:
    @classmethod
    def from_engine_args(cls,
                         engine_args: AsyncEngineArgs,
-                         start_engine_loop: bool = False) -> "AsyncLLMEngine":
+                         start_engine_loop: bool = True) -> "AsyncLLMEngine":
        """Creates an async LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_configs = engine_args.create_engine_configs()
@@ -435,12 +492,13 @@ class AsyncLLMEngine:
        distributed_init_method, placement_group = initialize_cluster(
            parallel_config, engine_args.engine_use_ray)
        # Create the async LLM engine.
-        engine = cls(engine_args.worker_use_ray,
+        engine = cls(parallel_config.worker_use_ray,
                     engine_args.engine_use_ray,
                     *engine_configs,
                     distributed_init_method,
                     placement_group,
                     log_requests=not engine_args.disable_log_requests,
                     log_stats=not engine_args.disable_log_stats,
+                     max_log_len=engine_args.max_log_len,
                     start_engine_loop=start_engine_loop)
        return engine
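`run_engine_loop` now parks on an `asyncio.Event` while idle and only steps while requests are in flight, instead of polling an empty scheduler. A distilled sketch of that pattern (the engine internals here are faked; only the wait/step/clear shape matches the diff):

```python
# Distilled sketch of the event-driven loop above: the engine blocks on an
# asyncio.Event while idle and only steps while requests are in flight.
# MiniEngine and its fake "step" are illustrative, not vLLM classes.
import asyncio
from typing import List


class MiniEngine:
    def __init__(self) -> None:
        self.new_requests_event = asyncio.Event()
        self.pending: List[str] = []

    def add_request(self, request_id: str) -> None:
        self.pending.append(request_id)
        self.new_requests_event.set()  # wake the loop

    async def engine_step(self) -> bool:
        """Process one request; return True if work remains."""
        if self.pending:
            print("processed", self.pending.pop(0))
        self.new_requests_event.clear()
        return bool(self.pending)

    async def run_engine_loop(self) -> None:
        has_requests_in_progress = False
        while True:
            if not has_requests_in_progress:
                await self.new_requests_event.wait()  # idle: no busy-spin
            has_requests_in_progress = await self.engine_step()
            await asyncio.sleep(0)  # yield to other tasks


async def main() -> None:
    engine = MiniEngine()
    loop_task = asyncio.create_task(engine.run_engine_loop())
    engine.add_request("req-0")
    engine.add_request("req-1")
    await asyncio.sleep(0.1)
    loop_task.cancel()


asyncio.run(main())
```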
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -7,13 +7,14 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.ray_utils import RayWorker, initialize_cluster, ray
+from vllm.engine.metrics import record_metrics
+from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
-                           SequenceGroupMetadata, SequenceOutputs,
-                           SequenceStatus)
+                           SequenceGroupMetadata, SequenceGroupOutput,
+                           SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                                get_tokenizer)
 from vllm.utils import Counter
@@ -54,8 +55,8 @@ class LLMEngine:
        scheduler_config: The configuration related to the request scheduler.
        distributed_init_method: The initialization method for distributed
            execution. See `torch.distributed.init_process_group` for details.
-        stage_devices: The list of devices for each stage. Each stage is a list
-            of (rank, node_resource, device) tuples.
+        placement_group: Ray placement group for distributed execution.
+            Required for distributed execution.
        log_stats: Whether to log statistics.
    """

@@ -74,11 +75,15 @@ class LLMEngine:
            f"model={model_config.model!r}, "
            f"tokenizer={model_config.tokenizer!r}, "
            f"tokenizer_mode={model_config.tokenizer_mode}, "
+            f"revision={model_config.revision}, "
+            f"tokenizer_revision={model_config.tokenizer_revision}, "
            f"trust_remote_code={model_config.trust_remote_code}, "
            f"dtype={model_config.dtype}, "
+            f"max_seq_len={model_config.max_model_len}, "
            f"download_dir={model_config.download_dir!r}, "
            f"load_format={model_config.load_format}, "
            f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
+            f"quantization={model_config.quantization}, "
            f"seed={model_config.seed})")
        # TODO(woosuk): Print more configs in debug mode.
@@ -92,7 +97,9 @@ class LLMEngine:
        self.tokenizer = get_tokenizer(
            model_config.tokenizer,
            tokenizer_mode=model_config.tokenizer_mode,
-            trust_remote_code=model_config.trust_remote_code)
+            trust_remote_code=model_config.trust_remote_code,
+            tokenizer_revision=model_config.tokenizer_revision,
+            revision=model_config.revision)
        self.seq_counter = Counter()

        # Create the parallel GPU workers.
@@ -117,7 +124,7 @@ class LLMEngine:
    def _init_workers(self, distributed_init_method: str):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
-        from vllm.worker.worker import Worker  # pylint: disable=import-outside-toplevel
+        from vllm.worker.worker import Worker

        assert self.parallel_config.world_size == 1, (
            "Ray is required if parallel_config.world_size > 1.")
@@ -135,25 +142,35 @@ class LLMEngine:
            "init_model",
            get_all_outputs=True,
        )
+        self._run_workers(
+            "load_model",
+            get_all_outputs=True,
+            max_concurrent_workers=self.parallel_config.
+            max_parallel_loading_workers,
+        )

    def _init_workers_ray(self, placement_group: "PlacementGroup",
                          **ray_remote_kwargs):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
-        from vllm.worker.worker import Worker  # pylint: disable=import-outside-toplevel
+        from vllm.worker.worker import Worker

        self.workers: List[Worker] = []
        for bundle in placement_group.bundle_specs:
            if not bundle.get("GPU", 0):
                continue
+            if self.parallel_config.tensor_parallel_size == 1:
+                num_gpus = self.cache_config.gpu_memory_utilization
+            else:
+                num_gpus = 1
            worker = ray.remote(
                num_cpus=0,
-                num_gpus=1,
+                num_gpus=num_gpus,
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=placement_group,
                    placement_group_capture_child_tasks=True),
                **ray_remote_kwargs,
-            )(RayWorker).remote()
+            )(RayWorkerVllm).remote(self.model_config.trust_remote_code)
            self.workers.append(worker)

        # Initialize torch distributed process group for the workers.
@@ -174,6 +191,12 @@ class LLMEngine:
            "init_model",
            get_all_outputs=True,
        )
+        self._run_workers(
+            "load_model",
+            get_all_outputs=True,
+            max_concurrent_workers=self.parallel_config.
+            max_parallel_loading_workers,
+        )

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
@@ -248,10 +271,10 @@ class LLMEngine:
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.
            arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
        """
        if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
        if prompt_token_ids is None:
            assert prompt is not None
            prompt_token_ids = self.tokenizer.encode(prompt)
@@ -291,14 +314,12 @@ class LLMEngine:
    def _schedule(
        self
    ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs,
-               Optional[List[RequestOutput]]]:
+               List[RequestOutput]]:
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
-        if scheduler_outputs.is_empty():
-            return seq_group_metadata_list, scheduler_outputs, [
-                RequestOutput.from_seq_group(seq_group)
-                for seq_group in scheduler_outputs.ignored_seq_groups
-            ]
-        return seq_group_metadata_list, scheduler_outputs, None
+        return seq_group_metadata_list, scheduler_outputs, [
+            RequestOutput.from_seq_group(seq_group)
+            for seq_group in scheduler_outputs.ignored_seq_groups
+        ]

    def _check_beam_search_early_stopping(
        self,
@@ -344,9 +365,15 @@ class LLMEngine:
                             eos_token_id=self.tokenizer.eos_token_id))
        return current_worst_score >= highest_attainable_score

-    def _process_sequence_group_samples(
-            self, seq_group: SequenceGroup,
-            samples: List[SequenceOutputs]) -> None:
+    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
+                                        outputs: SequenceGroupOutput) -> None:
+        # Process prompt logprobs
+        prompt_logprobs = outputs.prompt_logprobs
+        if prompt_logprobs is not None:
+            seq_group.prompt_logprobs = prompt_logprobs
+
+        # Process samples
+        samples = outputs.samples
        parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
        existing_finished_seqs = seq_group.get_finished_seqs()
        parent_child_dict = {
@@ -360,7 +387,7 @@ class LLMEngine:

        # Process the child samples for each parent sequence
        for parent in parent_seqs:
-            child_samples: List[SequenceOutputs] = parent_child_dict[
+            child_samples: List[SequenceOutput] = parent_child_dict[
                parent.seq_id]
            if len(child_samples) == 0:
                # This parent sequence has no children samples. Remove
@@ -386,7 +413,7 @@ class LLMEngine:
            child_seqs.append((parent, parent))

        for seq, _ in child_seqs:
-            self._decode_sequence(seq)
+            self._decode_sequence(seq, seq_group.sampling_params)
            self._check_stop(seq, seq_group.sampling_params)

        # Non-beam search case
@@ -514,8 +541,8 @@ class LLMEngine:
            scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
        # Update the scheduled sequence groups with the model outputs.
        scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
-        for seq_group, samples in zip(scheduled_seq_groups, output):
-            self._process_sequence_group_samples(seq_group, samples)
+        for seq_group, outputs in zip(scheduled_seq_groups, output):
+            self._process_sequence_group_outputs(seq_group, outputs)

        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()
@@ -542,10 +569,9 @@ class LLMEngine:
        and updates the scheduler with the model outputs. Finally, it decodes
        the sequences and returns the newly generated results.
        """
-        (seq_group_metadata_list, scheduler_outputs,
-         early_return) = self._schedule()
-        if early_return is not None:
-            return early_return
+        seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
+        if scheduler_outputs.is_empty():
+            return ignored

        # Execute the model.
        output = self._run_workers(
@@ -563,15 +589,15 @@ class LLMEngine:
        prompt_run: bool,
        num_batched_tokens: int,
    ) -> None:
-        now = time.time()
+        now = time.monotonic()
        # Log the number of batched input tokens.
        if prompt_run:
            self.num_prompt_tokens.append((now, num_batched_tokens))
        else:
            self.num_generation_tokens.append((now, num_batched_tokens))

-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not should_log:
            return

        # Discard the old stats.
@@ -610,6 +636,16 @@ class LLMEngine:
        else:
            cpu_cache_usage = 0.0

+        record_metrics(
+            avg_prompt_throughput=avg_prompt_throughput,
+            avg_generation_throughput=avg_generation_throughput,
+            scheduler_running=len(self.scheduler.running),
+            scheduler_swapped=len(self.scheduler.swapped),
+            scheduler_waiting=len(self.scheduler.waiting),
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+        )
+
        logger.info("Avg prompt throughput: "
                    f"{avg_prompt_throughput:.1f} tokens/s, "
                    "Avg generation throughput: "
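Several hunks in this file and in the scheduler replace `time.time()` with `time.monotonic()` for interval measurement. The monotonic clock cannot be stepped backwards by NTP or manual clock changes, so elapsed times stay non-negative:

```python
# Why the hunks above prefer time.monotonic() for intervals: the wall clock
# can be adjusted (NTP, DST, manual changes) and may jump backwards, while
# the monotonic clock only ever moves forward.
import time

start = time.monotonic()
time.sleep(0.05)
elapsed = time.monotonic() - start
assert elapsed >= 0.0  # guaranteed for monotonic, not for time.time()
print(f"elapsed: {elapsed:.3f}s")
```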
@@ -621,17 +657,25 @@ class LLMEngine:
                    f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
        self.last_logging_time = now

-    def _decode_sequence(self, seq: Sequence) -> None:
+    def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
        """Decodes the new token for a sequence."""
-        new_token, new_output_text = detokenize_incrementally(
-            self.tokenizer,
-            seq.output_tokens,
-            seq.get_last_token_id(),
-            skip_special_tokens=True,
-        )
-        if new_token is not None:
-            seq.output_tokens.append(new_token)
-            seq.output_text = new_output_text
+        (new_tokens, new_output_text, prefix_offset,
+         read_offset) = detokenize_incrementally(
+             self.tokenizer,
+             all_input_ids=seq.get_token_ids(),
+             prev_tokens=seq.tokens,
+             prefix_offset=seq.prefix_offset,
+             read_offset=seq.read_offset,
+             skip_special_tokens=prms.skip_special_tokens,
+             spaces_between_special_tokens=prms.spaces_between_special_tokens,
+         )
+        if seq.tokens is None:
+            seq.tokens = new_tokens
+        else:
+            seq.tokens.extend(new_tokens)
+        seq.prefix_offset = prefix_offset
+        seq.read_offset = read_offset
+        seq.output_text += new_output_text

    def _check_stop(self, seq: Sequence,
                    sampling_params: SamplingParams) -> None:
@@ -643,6 +687,9 @@ class LLMEngine:
                seq.output_text = seq.output_text[:-len(stop_str)]
                seq.status = SequenceStatus.FINISHED_STOPPED
                return
+        if seq.get_last_token_id() in sampling_params.stop_token_ids:
+            seq.status = SequenceStatus.FINISHED_STOPPED
+            return

        # Check if the sequence has reached max_model_len.
        if seq.get_len() > self.scheduler_config.max_model_len:
@@ -660,16 +707,15 @@ class LLMEngine:
            seq.status = SequenceStatus.FINISHED_STOPPED
            return

-    def _run_workers(
+    def _run_workers_in_batch(
        self,
+        workers,
        method: str,
        *args,
-        get_all_outputs: bool = False,
        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers."""
+    ):
        all_outputs = []
-        for worker in self.workers:
+        for worker in workers:
            if self.parallel_config.worker_use_ray:
                executor = partial(worker.execute_method.remote, method)
            else:
@@ -677,9 +723,31 @@ class LLMEngine:

            output = executor(*args, **kwargs)
            all_outputs.append(output)

        if self.parallel_config.worker_use_ray:
            all_outputs = ray.get(all_outputs)
+        return all_outputs
+
+    def _run_workers(
+        self,
+        method: str,
+        *args,
+        get_all_outputs: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers."""
+        all_outputs = []
+        if max_concurrent_workers:
+            work_groups = [
+                self.workers[i:i + max_concurrent_workers]
+                for i in range(0, len(self.workers), max_concurrent_workers)
+            ]
+        else:
+            work_groups = [self.workers]
+
+        for workers in work_groups:
+            all_outputs.extend(
+                self._run_workers_in_batch(workers, method, *args, **kwargs))
+
        if get_all_outputs:
            return all_outputs
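The split into `_run_workers_in_batch` plus a chunking `_run_workers` lets `load_model` run on at most `max_concurrent_workers` workers at a time, bounding peak host RAM during weight loading. A standalone sketch of the chunking (plain callables stand in for Ray workers):

```python
# Standalone sketch of the _run_workers split above: slice the worker list
# into chunks of max_concurrent_workers and run one chunk at a time, so a
# large tensor-parallel model can be loaded without every worker reading
# weights at once. Workers are plain callables here, not Ray actors.
from typing import Any, Callable, List, Optional


def run_workers(workers: List[Callable[[], Any]],
                max_concurrent_workers: Optional[int] = None) -> List[Any]:
    if max_concurrent_workers:
        work_groups = [
            workers[i:i + max_concurrent_workers]
            for i in range(0, len(workers), max_concurrent_workers)
        ]
    else:
        work_groups = [workers]

    all_outputs: List[Any] = []
    for group in work_groups:
        # Each group would be dispatched together (e.g. via ray.get);
        # sequential calls stand in for that here.
        all_outputs.extend(worker() for worker in group)
    return all_outputs


if __name__ == "__main__":
    workers = [lambda i=i: f"loaded shard {i}" for i in range(5)]
    print(run_workers(workers, max_concurrent_workers=2))
```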
new file: vllm/engine/metrics.py (51 lines)
--- /dev/null
+++ b/vllm/engine/metrics.py
@@ -0,0 +1,51 @@
+from aioprometheus import Gauge
+
+# The begin-* and end-* here are used by the documentation generator
+# to extract the metrics definitions.
+
+# begin-metrics-definitions
+gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
+                                    "Average prefill throughput in tokens/s.")
+gauge_avg_generation_throughput = Gauge(
+    "vllm:avg_generation_throughput_toks_per_s",
+    "Average generation throughput in tokens/s.")
+
+gauge_scheduler_running = Gauge(
+    "vllm:num_requests_running",
+    "Number of requests that is currently running for inference.")
+gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
+                                "Number requests swapped to CPU.")
+gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
+                                "Number of requests waiting to be processed.")
+
+gauge_gpu_cache_usage = Gauge(
+    "vllm:gpu_cache_usage_perc",
+    "GPU KV-cache usage. 1 means 100 percent usage.")
+gauge_cpu_cache_usage = Gauge(
+    "vllm:cpu_cache_usage_perc",
+    "CPU KV-cache usage. 1 means 100 percent usage.")
+# end-metrics-definitions
+
+labels = {}
+
+
+def add_global_metrics_labels(**kwargs):
+    labels.update(kwargs)
+
+
+def record_metrics(
+    avg_prompt_throughput: float,
+    avg_generation_throughput: float,
+    scheduler_running: int,
+    scheduler_swapped: int,
+    scheduler_waiting: int,
+    gpu_cache_usage: float,
+    cpu_cache_usage: float,
+):
+    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
+    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
+    gauge_scheduler_running.set(labels, scheduler_running)
+    gauge_scheduler_swapped.set(labels, scheduler_swapped)
+    gauge_scheduler_waiting.set(labels, scheduler_waiting)
+    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
+    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
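A sketch of exercising the new metrics module directly; both functions are defined in the file above, while the label and values here are made up:

```python
# Sketch: calling the new metrics module directly. Both functions come from
# the file above; the label and the sample values are illustrative.
from vllm.engine.metrics import add_global_metrics_labels, record_metrics

# Labels are attached to every gauge sample, e.g. to tag the served model.
add_global_metrics_labels(model_name="facebook/opt-125m")

record_metrics(
    avg_prompt_throughput=1024.0,   # tokens/s over the last logging window
    avg_generation_throughput=256.0,
    scheduler_running=4,
    scheduler_swapped=0,
    scheduler_waiting=2,
    gpu_cache_usage=0.42,           # fraction of GPU KV-cache blocks in use
    cpu_cache_usage=0.0,
)
```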
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -2,16 +2,22 @@ import socket
 from typing import Optional, Tuple, TYPE_CHECKING
 
 from vllm.config import ParallelConfig
+from vllm.logger import init_logger
 
+logger = init_logger(__name__)
+
 try:
     import ray
     from ray.air.util.torch_dist import TorchDistributedWorker
 
-    class RayWorker(TorchDistributedWorker):
+    class RayWorkerVllm(TorchDistributedWorker):
        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""
 
-        def __init__(self) -> None:
+        def __init__(self, init_cached_hf_modules=False) -> None:
+            if init_cached_hf_modules:
+                from transformers.dynamic_module_utils import init_hf_modules
+                init_hf_modules()
             self.worker = None
 
         def init_worker(self, worker_init_fn):
@@ -24,10 +30,13 @@ try:
             executor = getattr(self, method)
             return executor(*args, **kwargs)
 
-except ImportError:
+except ImportError as e:
+    logger.warning(f"Failed to import Ray with {e!r}. "
+                   "For distributed inference, please install Ray with "
+                   "`pip install ray pandas pyarrow`.")
     ray = None
     TorchDistributedWorker = None
-    RayWorker = None  # pylint: disable=invalid-name
+    RayWorkerVllm = None
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -53,11 +62,10 @@ def initialize_cluster(
         the default Ray cluster address.
 
     Returns:
-        A tuple of (`distributed_init_method`, `all_stage_devices`). The
+        A tuple of (`distributed_init_method`, `placement_group`). The
         `distributed_init_method` is the address for initializing the
-        distributed backend. `all_stage_devices` includes device IDs for
-        each worker in each pipeline stage. Each device ID is a tuple of
-        (rank, node resource, device id).
+        distributed backend. `placement_group` includes the specification
+        of the resources for each distributed worker.
     """
     if parallel_config.worker_use_ray or engine_use_ray:
        if ray is None:
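
The point of the lazy `init_worker` indirection above is ordering: the actor must exist, so that Ray can assign a GPU and set `CUDA_VISIBLE_DEVICES`, before the heavyweight `Worker` is built inside it. A hedged sketch of the driver-side pattern (`create_worker` is a hypothetical zero-argument factory standing in for vLLM's own engine wiring):

    import ray

    ray.init()
    # Create the actor first; Ray pins its GPU at this point.
    actor_cls = ray.remote(num_gpus=1)(RayWorkerVllm)
    worker = actor_cls.remote(init_cached_hf_modules=True)
    # Only now build the real vllm.worker.Worker, inside the actor process.
    ray.get(worker.init_worker.remote(create_worker))
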
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -2,7 +2,7 @@ import argparse
 import json
 from typing import AsyncGenerator
 
-from fastapi import BackgroundTasks, FastAPI, Request
+from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 import uvicorn
 
@@ -17,6 +17,12 @@ app = FastAPI()
 engine = None
 
 
+@app.get("/health")
+async def health() -> Response:
+    """Health check."""
+    return Response(status_code=200)
+
+
 @app.post("/generate")
 async def generate(request: Request) -> Response:
     """Generate completion for the request.
@@ -32,9 +38,6 @@ async def generate(request: Request) -> Response:
     sampling_params = SamplingParams(**request_dict)
     request_id = random_uuid()
 
-    if not engine.is_running:
-        engine.start_background_loop()
-
     results_generator = engine.generate(prompt, sampling_params, request_id)
 
     # Streaming case
@@ -47,14 +50,8 @@ async def generate(request: Request) -> Response:
             ret = {"text": text_outputs}
             yield (json.dumps(ret) + "\0").encode("utf-8")
 
-    async def abort_request() -> None:
-        await engine.abort(request_id)
-
     if stream:
-        background_tasks = BackgroundTasks()
-        # Abort the request if the client disconnects.
-        background_tasks.add_task(abort_request)
-        return StreamingResponse(stream_results(), background=background_tasks)
+        return StreamingResponse(stream_results())
 
     # Non-streaming case
     final_output = None
@@ -74,14 +71,13 @@ async def generate(request: Request) -> Response:
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--host", type=str, default=None)
     parser.add_argument("--port", type=int, default=8000)
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args,
-                                             start_engine_loop=False)
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     uvicorn.run(app,
                 host=args.host,
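
A quick client for the demo server above (a sketch assuming it runs locally on the default port and `requests` is installed; JSON keys other than `prompt` and `stream` are forwarded to `SamplingParams`):

    import json
    import requests

    # The new liveness probe.
    assert requests.get("http://localhost:8000/health").status_code == 200

    # Non-streaming generation.
    response = requests.post(
        "http://localhost:8000/generate",
        json={"prompt": "San Francisco is a", "max_tokens": 32},
    )
    print(json.loads(response.text)["text"])
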
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -37,7 +37,24 @@ class LLM:
             the `torch_dtype` attribute specified in the model config file.
             However, if the `torch_dtype` in the config is `float32`, we will
             use `float16` instead.
+        quantization: The method used to quantize the model weights. Currently,
+            we support "awq". If None, we assume the model weights are not
+            quantized and use `dtype` to determine the data type of the weights.
+        revision: The specific model version to use. It can be a branch name,
+            a tag name, or a commit id.
+        tokenizer_revision: The specific tokenizer version to use. It can be a
+            branch name, a tag name, or a commit id.
         seed: The seed to initialize the random number generator for sampling.
+        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
+            reserve for the model weights, activations, and KV cache. Higher
+            values will increase the KV cache size and thus improve the model's
+            throughput. However, if the value is too high, it may cause out-of-
+            memory (OOM) errors.
+        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+            This can be used for temporarily storing the states of the requests
+            when their `best_of` sampling parameters are larger than 1. If all
+            requests will have `best_of=1`, you can safely set this to 0.
+            Otherwise, too small values may cause out-of-memory (OOM) errors.
     """
 
     def __init__(
@@ -48,7 +65,12 @@ class LLM:
         trust_remote_code: bool = False,
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
+        quantization: Optional[str] = None,
+        revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
         seed: int = 0,
+        gpu_memory_utilization: float = 0.9,
+        swap_space: int = 4,
         **kwargs,
     ) -> None:
         if "disable_log_stats" not in kwargs:
@@ -60,7 +82,12 @@ class LLM:
             trust_remote_code=trust_remote_code,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
+            quantization=quantization,
+            revision=revision,
+            tokenizer_revision=tokenizer_revision,
             seed=seed,
+            gpu_memory_utilization=gpu_memory_utilization,
+            swap_space=swap_space,
             **kwargs,
         )
         self.llm_engine = LLMEngine.from_engine_args(engine_args)
@@ -107,25 +134,21 @@ class LLM:
         if isinstance(prompts, str):
             # Convert a single prompt to a list.
             prompts = [prompts]
-        if prompts is not None and prompt_token_ids is not None:
-            if len(prompts) != len(prompt_token_ids):
+        if (prompts is not None and prompt_token_ids is not None
+                and len(prompts) != len(prompt_token_ids)):
             raise ValueError("The lengths of prompts and prompt_token_ids "
                              "must be the same.")
         if sampling_params is None:
             # Use default sampling params.
             sampling_params = SamplingParams()
 
         # Add requests to the engine.
-        if prompts is not None:
-            num_requests = len(prompts)
-        else:
-            num_requests = len(prompt_token_ids)
+        num_requests = len(prompts) if prompts is not None else len(
+            prompt_token_ids)
         for i in range(num_requests):
             prompt = prompts[i] if prompts is not None else None
-            if prompt_token_ids is None:
-                token_ids = None
-            else:
-                token_ids = prompt_token_ids[i]
+            token_ids = None if prompt_token_ids is None else prompt_token_ids[
+                i]
             self._add_request(prompt, sampling_params, token_ids)
         return self._run_engine(use_tqdm)
 
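
A hedged sketch of the new constructor arguments in use (the checkpoint name is illustrative and not part of this diff):

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="TheBloke/Llama-2-7B-AWQ",  # hypothetical AWQ-quantized checkpoint
        quantization="awq",               # weights are pre-quantized
        revision="main",                  # pin a model version
        gpu_memory_utilization=0.90,      # fraction of GPU memory vLLM may claim
        swap_space=4,                     # GiB of CPU swap per GPU for best_of > 1
    )
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)
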
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -3,21 +3,24 @@
 
 import argparse
 import asyncio
+import codecs
 import json
 import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
 
+from aioprometheus import MetricsMiddleware
+from aioprometheus.asgi.starlette import metrics
 import fastapi
 import uvicorn
-from fastapi import BackgroundTasks, Request
+from fastapi import Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse
-from packaging import version
+from fastapi.responses import JSONResponse, StreamingResponse, Response
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.metrics import add_global_metrics_labels
 from vllm.entrypoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse,
@@ -31,20 +34,59 @@ from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import random_uuid
 
-try:
-    import fastchat
-    from fastchat.conversation import Conversation, SeparatorStyle
-    from fastchat.model.model_adapter import get_conversation_template
-    _fastchat_available = True
-except ImportError:
-    _fastchat_available = False
-
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
 logger = init_logger(__name__)
 served_model = None
 app = fastapi.FastAPI()
 engine = None
+response_role = None
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="vLLM OpenAI-Compatible RESTful API server.")
+    parser.add_argument("--host", type=str, default=None, help="host name")
+    parser.add_argument("--port", type=int, default=8000, help="port number")
+    parser.add_argument("--allow-credentials",
+                        action="store_true",
+                        help="allow credentials")
+    parser.add_argument("--allowed-origins",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed origins")
+    parser.add_argument("--allowed-methods",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed methods")
+    parser.add_argument("--allowed-headers",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed headers")
+    parser.add_argument("--served-model-name",
+                        type=str,
+                        default=None,
+                        help="The model name used in the API. If not "
+                        "specified, the model name will be the same as "
+                        "the huggingface name.")
+    parser.add_argument("--chat-template",
+                        type=str,
+                        default=None,
+                        help="The file path to the chat template, "
+                        "or the template in single-line form "
+                        "for the specified model")
+    parser.add_argument("--response-role",
+                        type=str,
+                        default="assistant",
+                        help="The role name to return if "
+                        "`request.add_generation_prompt=true`.")
+
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    return parser.parse_args()
+
+
+app.add_middleware(MetricsMiddleware)  # Trace HTTP server metrics
+app.add_route("/metrics", metrics)  # Exposes HTTP metrics
 
 
 def create_error_response(status_code: HTTPStatus,
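
With the middleware and route registered above, Prometheus-format metrics are served next to the API itself. A hedged scrape sketch (the exact metric names depend on the gauge definitions in vllm.engine.metrics):

    import requests

    text = requests.get("http://localhost:8000/metrics").text
    for line in text.splitlines():
        if line.startswith("vllm"):  # assumption: engine gauges share a vllm prefix
            print(line)
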
@@ -54,8 +96,27 @@ def create_error_response(status_code: HTTPStatus,
                         status_code=status_code.value)
 
 
+def load_chat_template(args, tokenizer):
+    if args.chat_template is not None:
+        try:
+            with open(args.chat_template, "r") as f:
+                chat_template = f.read()
+        except OSError:
+            # If opening a file fails, treat args.chat_template as the template
+            # itself, decoding it so escapes are interpreted correctly.
+            chat_template = codecs.decode(args.chat_template, "unicode_escape")
+
+        tokenizer.chat_template = chat_template
+        logger.info(
+            f"Using supplied chat template:\n{tokenizer.chat_template}")
+    elif tokenizer.chat_template is not None:
+        logger.info(f"Using default chat template:\n{tokenizer.chat_template}")
+    else:
+        logger.warning("No chat template provided. Chat API will not work.")
+
+
 @app.exception_handler(RequestValidationError)
-async def validation_exception_handler(request, exc):  # pylint: disable=unused-argument
+async def validation_exception_handler(_, exc):
     return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))
 
 
@@ -69,53 +130,6 @@ async def check_model(request) -> Optional[JSONResponse]:
     return ret
 
 
-async def get_gen_prompt(request) -> str:
-    if not _fastchat_available:
-        raise ModuleNotFoundError(
-            "fastchat is not installed. Please install fastchat to use "
-            "the chat completion and conversation APIs: `$ pip install fschat`"
-        )
-    if version.parse(fastchat.__version__) < version.parse("0.2.23"):
-        raise ImportError(
-            f"fastchat version is low. Current version: {fastchat.__version__} "
-            "Please upgrade fastchat to use: `$ pip install -U fschat`")
-
-    conv = get_conversation_template(request.model)
-    conv = Conversation(
-        name=conv.name,
-        system_template=conv.system_template,
-        system_message=conv.system_message,
-        roles=conv.roles,
-        messages=list(conv.messages),  # prevent in-place modification
-        offset=conv.offset,
-        sep_style=SeparatorStyle(conv.sep_style),
-        sep=conv.sep,
-        sep2=conv.sep2,
-        stop_str=conv.stop_str,
-        stop_token_ids=conv.stop_token_ids,
-    )
-
-    if isinstance(request.messages, str):
-        prompt = request.messages
-    else:
-        for message in request.messages:
-            msg_role = message["role"]
-            if msg_role == "system":
-                conv.system_message = message["content"]
-            elif msg_role == "user":
-                conv.append_message(conv.roles[0], message["content"])
-            elif msg_role == "assistant":
-                conv.append_message(conv.roles[1], message["content"])
-            else:
-                raise ValueError(f"Unknown role: {msg_role}")
-
-        # Add a blank message for the assistant.
-        conv.append_message(conv.roles[1], None)
-        prompt = conv.get_prompt()
-
-    return prompt
-
-
 async def check_length(
     request: Union[ChatCompletionRequest, CompletionRequest],
     prompt: Optional[str] = None,
@@ -124,12 +138,12 @@ async def check_length(
     assert (not (prompt is None and prompt_ids is None)
             and not (prompt is not None and prompt_ids is not None)
             ), "Either prompt or prompt_ids should be provided."
-    if prompt_ids is not None:
-        input_ids = prompt_ids
-    else:
-        input_ids = tokenizer(prompt).input_ids
+    input_ids = prompt_ids if prompt_ids is not None else tokenizer(
+        prompt).input_ids
     token_num = len(input_ids)
 
+    if request.max_tokens is None:
+        request.max_tokens = max_model_len - token_num
     if token_num + request.max_tokens > max_model_len:
         return input_ids, create_error_response(
             HTTPStatus.BAD_REQUEST,
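
The `codecs.decode(..., "unicode_escape")` fallback in `load_chat_template` is what lets a template be passed inline on the command line with literal `\n` sequences. A self-contained sketch of that decoding step (the template string is illustrative):

    import codecs

    # As received from argparse: backslash-n, two characters.
    raw = "{% for m in messages %}{{ m['content'] }}\\n{% endfor %}"
    template = codecs.decode(raw, "unicode_escape")
    assert "\n" in template  # now a real newline
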
@@ -143,6 +157,12 @@ async def check_length(
     return input_ids, None
 
 
+@app.get("/health")
+async def health() -> Response:
+    """Health check."""
+    return Response(status_code=200)
+
+
 @app.get("/v1/models")
 async def show_available_models():
     """Show available models. Right now we only have one model."""
@@ -154,16 +174,26 @@
     return ModelList(data=model_cards)
 
 
-def create_logprobs(token_ids: List[int],
-                    id_logprobs: List[Dict[int, float]],
-                    initial_text_offset: int = 0) -> LogProbs:
+def create_logprobs(
+    token_ids: List[int],
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
+    num_output_top_logprobs: Optional[int] = None,
+    initial_text_offset: int = 0,
+) -> LogProbs:
     """Create OpenAI-style logprobs."""
     logprobs = LogProbs()
     last_token_len = 0
-    for token_id, id_logprob in zip(token_ids, id_logprobs):
+    if num_output_top_logprobs:
+        logprobs.top_logprobs = []
+    for i, token_id in enumerate(token_ids):
+        step_top_logprobs = top_logprobs[i]
+        if step_top_logprobs is not None:
+            token_logprob = step_top_logprobs[token_id]
+        else:
+            token_logprob = None
         token = tokenizer.convert_ids_to_tokens(token_id)
         logprobs.tokens.append(token)
-        logprobs.token_logprobs.append(id_logprob[token_id])
+        logprobs.token_logprobs.append(token_logprob)
         if len(logprobs.text_offset) == 0:
             logprobs.text_offset.append(initial_text_offset)
         else:
@@ -171,10 +201,11 @@ def create_logprobs(token_ids: List[int],
                 last_token_len)
         last_token_len = len(token)
 
-        logprobs.top_logprobs.append({
-            tokenizer.convert_ids_to_tokens(i): p
-            for i, p in id_logprob.items()
-        })
+        if num_output_top_logprobs:
+            logprobs.top_logprobs.append({
+                tokenizer.convert_ids_to_tokens(i): p
+                for i, p in step_top_logprobs.items()
+            } if step_top_logprobs else None)
     return logprobs
 
 
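
A hedged sketch of the reworked `create_logprobs` helper (values are invented; `tokenizer` is the module-level tokenizer the server sets up at startup). A `None` entry in `top_logprobs`, e.g. an unscored echoed prompt position, yields a `None` token logprob:

    token_ids = [1000, 1001]
    top_logprobs = [None, {1001: -0.1, 1002: -2.3}]
    lp = create_logprobs(token_ids=token_ids,
                         top_logprobs=top_logprobs,
                         num_output_top_logprobs=2)
    print(lp.tokens, lp.token_logprobs)  # second entry scored, first is None
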
@@ -190,29 +221,34 @@ async def create_chat_completion(request: ChatCompletionRequest,
         - function_call (Users should implement this by themselves)
         - logit_bias (to be supported by vLLM engine)
     """
-    logger.info(f"Received chat completion request: {request}")
-
-    if not engine.is_running:
-        engine.start_background_loop()
-
     error_check_ret = await check_model(request)
     if error_check_ret is not None:
         return error_check_ret
 
-    if request.logit_bias is not None:
+    if request.logit_bias is not None and len(request.logit_bias) > 0:
         # TODO: support logit_bias in vLLM engine.
         return create_error_response(HTTPStatus.BAD_REQUEST,
                                      "logit_bias is not currently supported")
 
-    prompt = await get_gen_prompt(request)
+    try:
+        prompt = tokenizer.apply_chat_template(
+            conversation=request.messages,
+            tokenize=False,
+            add_generation_prompt=request.add_generation_prompt)
+    except Exception as e:
+        logger.error(f"Error in applying chat template from request: {str(e)}")
+        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
+
     token_ids, error_check_ret = await check_length(request, prompt=prompt)
     if error_check_ret is not None:
         return error_check_ret
 
     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
+    chunk_object_type = "chat.completion.chunk"
     try:
+        spaces_between_special_tokens = request.spaces_between_special_tokens
         sampling_params = SamplingParams(
             n=request.n,
             presence_penalty=request.presence_penalty,
@@ -220,11 +256,14 @@ async def create_chat_completion(request: ChatCompletionRequest,
             temperature=request.temperature,
             top_p=request.top_p,
             stop=request.stop,
+            stop_token_ids=request.stop_token_ids,
             max_tokens=request.max_tokens,
             best_of=request.best_of,
             top_k=request.top_k,
             ignore_eos=request.ignore_eos,
             use_beam_search=request.use_beam_search,
+            skip_special_tokens=request.skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
         )
     except ValueError as e:
         return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
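
Prompt construction now delegates to the tokenizer's own Jinja chat template instead of fastchat's conversation registry. A standalone sketch of the same call (assumes a transformers version with chat-template support; the model name is illustrative):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
    prompt = tok.apply_chat_template(
        conversation=[{"role": "user", "content": "Hello!"}],
        tokenize=False,
        add_generation_prompt=True,
    )
    print(prompt)
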
@@ -232,123 +271,162 @@ async def create_chat_completion(request: ChatCompletionRequest,
     result_generator = engine.generate(prompt, sampling_params, request_id,
                                        token_ids)
 
-    async def abort_request() -> None:
-        await engine.abort(request_id)
-
-    def create_stream_response_json(
-        index: int,
-        text: str,
-        finish_reason: Optional[str] = None,
-    ) -> str:
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=index,
-            delta=DeltaMessage(content=text),
-            finish_reason=finish_reason,
-        )
-        response = ChatCompletionStreamResponse(
-            id=request_id,
-            created=created_time,
-            model=model_name,
-            choices=[choice_data],
-        )
-        response_json = response.json(ensure_ascii=False)
-
-        return response_json
+    def get_role() -> str:
+        if request.add_generation_prompt:
+            return response_role
+        else:
+            return request.messages[-1]["role"]
 
     async def completion_stream_generator() -> AsyncGenerator[str, None]:
-        # First chunk with role
+        # Send first response for each request.n (index) with the role
+        role = get_role()
         for i in range(request.n):
             choice_data = ChatCompletionResponseStreamChoice(
-                index=i,
-                delta=DeltaMessage(role="assistant"),
-                finish_reason=None,
-            )
+                index=i, delta=DeltaMessage(role=role), finish_reason=None)
             chunk = ChatCompletionStreamResponse(id=request_id,
+                                                 object=chunk_object_type,
+                                                 created=created_time,
                                                  choices=[choice_data],
                                                  model=model_name)
             data = chunk.json(exclude_unset=True, ensure_ascii=False)
             yield f"data: {data}\n\n"
 
+        # Send response to echo the input portion of the last message
+        if request.echo:
+            last_msg_content = ""
+            if request.messages and isinstance(
+                    request.messages, list) and request.messages[-1].get(
+                        "content") and request.messages[-1].get(
+                            "role") == role:
+                last_msg_content = request.messages[-1]["content"]
+            if last_msg_content:
+                for i in range(request.n):
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=last_msg_content),
+                        finish_reason=None)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    data = chunk.json(exclude_unset=True, ensure_ascii=False)
+                    yield f"data: {data}\n\n"
+
+        # Send response for each token for each request.n (index)
         previous_texts = [""] * request.n
         previous_num_tokens = [0] * request.n
+        finish_reason_sent = [False] * request.n
         async for res in result_generator:
             res: RequestOutput
             for output in res.outputs:
                 i = output.index
-                delta_text = output.text[len(previous_texts[i]):]
-                previous_texts[i] = output.text
-                previous_num_tokens[i] = len(output.token_ids)
-                response_json = create_stream_response_json(
-                    index=i,
-                    text=delta_text,
-                )
-                yield f"data: {response_json}\n\n"
-                if output.finish_reason is not None:
-                    response_json = create_stream_response_json(
-                        index=i,
-                        text="",
-                        finish_reason=output.finish_reason,
-                    )
-                    yield f"data: {response_json}\n\n"
+
+                if finish_reason_sent[i]:
+                    continue
+
+                if output.finish_reason is None:
+                    # Send token-by-token response for each request.n
+                    delta_text = output.text[len(previous_texts[i]):]
+                    previous_texts[i] = output.text
+                    completion_tokens = len(output.token_ids)
+                    previous_num_tokens[i] = completion_tokens
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=delta_text),
+                        finish_reason=None)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    data = chunk.json(exclude_unset=True, ensure_ascii=False)
+                    yield f"data: {data}\n\n"
+                else:
+                    # Send the finish response for each request.n only once
+                    prompt_tokens = len(res.prompt_token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    )
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i, delta=[], finish_reason=output.finish_reason)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    if final_usage is not None:
+                        chunk.usage = final_usage
+                    data = chunk.json(exclude_unset=True,
+                                      exclude_none=True,
+                                      ensure_ascii=False)
+                    yield f"data: {data}\n\n"
+                    finish_reason_sent[i] = True
+        # Send the final done message after all response.n are finished
         yield "data: [DONE]\n\n"
 
+    async def completion_full_generator():
+        final_res: RequestOutput = None
+        async for res in result_generator:
+            if await raw_request.is_disconnected():
+                # Abort the request if the client disconnects.
+                await engine.abort(request_id)
+                return create_error_response(HTTPStatus.BAD_REQUEST,
+                                             "Client disconnected")
+            final_res = res
+        assert final_res is not None
+
+        choices = []
+        role = get_role()
+        for output in final_res.outputs:
+            choice_data = ChatCompletionResponseChoice(
+                index=output.index,
+                message=ChatMessage(role=role, content=output.text),
+                finish_reason=output.finish_reason,
+            )
+            choices.append(choice_data)
+
+        if request.echo:
+            last_msg_content = ""
+            if request.messages and isinstance(
+                    request.messages, list) and request.messages[-1].get(
+                        "content") and request.messages[-1].get(
+                            "role") == role:
+                last_msg_content = request.messages[-1]["content"]
+
+            for choice in choices:
+                full_message = last_msg_content + choice.message.content
+                choice.message.content = full_message
+
+        num_prompt_tokens = len(final_res.prompt_token_ids)
+        num_generated_tokens = sum(
+            len(output.token_ids) for output in final_res.outputs)
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
+        response = ChatCompletionResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            choices=choices,
+            usage=usage,
+        )
+
+        return response
+
     # Streaming response
     if request.stream:
-        background_tasks = BackgroundTasks()
-        # Abort the request if the client disconnects.
-        background_tasks.add_task(abort_request)
         return StreamingResponse(completion_stream_generator(),
-                                 media_type="text/event-stream",
-                                 background=background_tasks)
-
-    # Non-streaming response
-    final_res: RequestOutput = None
-    async for res in result_generator:
-        if await raw_request.is_disconnected():
-            # Abort the request if the client disconnects.
-            await abort_request()
-            return create_error_response(HTTPStatus.BAD_REQUEST,
-                                         "Client disconnected")
-        final_res = res
-    assert final_res is not None
-    choices = []
-    for output in final_res.outputs:
-        choice_data = ChatCompletionResponseChoice(
-            index=output.index,
-            message=ChatMessage(role="assistant", content=output.text),
-            finish_reason=output.finish_reason,
-        )
-        choices.append(choice_data)
-
-    num_prompt_tokens = len(final_res.prompt_token_ids)
-    num_generated_tokens = sum(
-        len(output.token_ids) for output in final_res.outputs)
-    usage = UsageInfo(
-        prompt_tokens=num_prompt_tokens,
-        completion_tokens=num_generated_tokens,
-        total_tokens=num_prompt_tokens + num_generated_tokens,
-    )
-    response = ChatCompletionResponse(
-        id=request_id,
-        created=created_time,
-        model=model_name,
-        choices=choices,
-        usage=usage,
-    )
-
-    if request.stream:
-        # When user requests streaming but we don't stream, we still need to
-        # return a streaming response with a single event.
-        response_json = response.json(ensure_ascii=False)
-
-        async def fake_stream_generator() -> AsyncGenerator[str, None]:
-            yield f"data: {response_json}\n\n"
-            yield "data: [DONE]\n\n"
-
-        return StreamingResponse(fake_stream_generator(),
-                                 media_type="text/event-stream")
-
-    return response
+                                 media_type="text/event-stream")
+    else:
+        return await completion_full_generator()
 
 
 @app.post("/v1/completions")
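
Client-side, the stream above is plain server-sent events: a role-only first delta per choice, optional echo chunks, token deltas, then a finish chunk whose `delta` is an empty list and which may carry `usage`. A hedged reader sketch:

    import json
    import requests

    resp = requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "facebook/opt-125m",  # illustrative served model
            "messages": [{"role": "user", "content": "Hi"}],
            "stream": True,
        },
        stream=True,
    )
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        if isinstance(delta, dict):  # the finish chunk sends delta=[]
            print(delta.get("content", ""), end="", flush=True)
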
@@ -359,33 +437,24 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     for the API specification. This API mimics the OpenAI Completion API.
 
     NOTE: Currently we do not support the following features:
-        - echo (since the vLLM engine does not currently support
-          getting the logprobs of prompt tokens)
         - suffix (the language models we currently support do not support
           suffix)
         - logit_bias (to be supported by vLLM engine)
     """
-    logger.info(f"Received completion request: {request}")
-
-    if not engine.is_running:
-        engine.start_background_loop()
-
     error_check_ret = await check_model(request)
     if error_check_ret is not None:
         return error_check_ret
 
-    if request.echo:
-        # We do not support echo since the vLLM engine does not
-        # currently support getting the logprobs of prompt tokens.
-        return create_error_response(HTTPStatus.BAD_REQUEST,
-                                     "echo is not currently supported")
+    # OpenAI API supports echoing the prompt when max_tokens is 0.
+    echo_without_generation = request.echo and request.max_tokens == 0
 
     if request.suffix is not None:
         # The language models we currently support do not support suffix.
         return create_error_response(HTTPStatus.BAD_REQUEST,
                                      "suffix is not currently supported")
 
-    if request.logit_bias is not None:
+    if request.logit_bias is not None and len(request.logit_bias) > 0:
         # TODO: support logit_bias in vLLM engine.
         return create_error_response(HTTPStatus.BAD_REQUEST,
                                      "logit_bias is not currently supported")
@@ -420,8 +489,9 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret
 
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
+        spaces_between_special_tokens = request.spaces_between_special_tokens
         sampling_params = SamplingParams(
             n=request.n,
             best_of=request.best_of,
@@ -431,10 +501,15 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
             top_p=request.top_p,
             top_k=request.top_k,
             stop=request.stop,
+            stop_token_ids=request.stop_token_ids,
             ignore_eos=request.ignore_eos,
-            max_tokens=request.max_tokens,
+            max_tokens=request.max_tokens
+            if not echo_without_generation else 1,
             logprobs=request.logprobs,
             use_beam_search=request.use_beam_search,
+            prompt_logprobs=request.logprobs if request.echo else None,
+            skip_special_tokens=request.skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
         )
     except ValueError as e:
         return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
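
The `echo_without_generation` flag reproduces an OpenAI behavior: `echo=true` with `max_tokens=0` returns the prompt (and, with `logprobs` set, its prompt logprobs) while generating nothing; internally `max_tokens` is bumped to 1 so the engine still runs. A hedged request sketch:

    import requests

    resp = requests.post(
        "http://localhost:8000/v1/completions",
        json={
            "model": "facebook/opt-125m",
            "prompt": "The capital of France is",
            "echo": True,
            "max_tokens": 0,  # echo-only
            "logprobs": 1,    # include prompt token logprobs
        },
    )
    print(resp.json()["choices"][0]["text"])  # the prompt itself
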
@@ -454,14 +529,12 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
                   and (request.best_of is None or request.n == request.best_of)
                   and not request.use_beam_search)
 
-    async def abort_request() -> None:
-        await engine.abort(request_id)
-
     def create_stream_response_json(
         index: int,
         text: str,
         logprobs: Optional[LogProbs] = None,
         finish_reason: Optional[str] = None,
+        usage: Optional[UsageInfo] = None,
     ) -> str:
         choice_data = CompletionResponseStreamChoice(
             index=index,
@@ -475,73 +548,119 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
             model=model_name,
             choices=[choice_data],
         )
-        response_json = response.json(ensure_ascii=False)
+        if usage is not None:
+            response.usage = usage
+        response_json = response.json(exclude_unset=True, ensure_ascii=False)
 
         return response_json
 
     async def completion_stream_generator() -> AsyncGenerator[str, None]:
         previous_texts = [""] * request.n
         previous_num_tokens = [0] * request.n
+        has_echoed = [False] * request.n
         async for res in result_generator:
             res: RequestOutput
             for output in res.outputs:
                 i = output.index
                 delta_text = output.text[len(previous_texts[i]):]
+                token_ids = output.token_ids[previous_num_tokens[i]:]
+                top_logprobs = output.logprobs[previous_num_tokens[i]:]
+                offsets = len(previous_texts[i])
+                if request.echo and not has_echoed[i]:
+                    if not echo_without_generation:
+                        delta_text = res.prompt + delta_text
+                        token_ids = res.prompt_token_ids + token_ids
+                        top_logprobs = res.prompt_logprobs + top_logprobs
+                    else:
+                        delta_text = res.prompt
+                        token_ids = res.prompt_token_ids
+                        top_logprobs = res.prompt_logprobs
+                    has_echoed[i] = True
                 if request.logprobs is not None:
                     logprobs = create_logprobs(
-                        output.token_ids[previous_num_tokens[i]:],
-                        output.logprobs[previous_num_tokens[i]:],
-                        len(previous_texts[i]))
+                        token_ids=token_ids,
+                        top_logprobs=top_logprobs,
+                        num_output_top_logprobs=request.logprobs,
+                        initial_text_offset=offsets,
+                    )
                 else:
                     logprobs = None
                 previous_texts[i] = output.text
                 previous_num_tokens[i] = len(output.token_ids)
+                finish_reason = output.finish_reason
                 response_json = create_stream_response_json(
                     index=i,
                     text=delta_text,
                     logprobs=logprobs,
+                    finish_reason=finish_reason,
                 )
                 yield f"data: {response_json}\n\n"
                 if output.finish_reason is not None:
                     logprobs = (LogProbs()
                                 if request.logprobs is not None else None)
+                    prompt_tokens = len(res.prompt_token_ids)
+                    completion_tokens = len(output.token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    )
                     response_json = create_stream_response_json(
                         index=i,
                         text="",
                         logprobs=logprobs,
                         finish_reason=output.finish_reason,
+                        usage=final_usage,
                     )
                     yield f"data: {response_json}\n\n"
         yield "data: [DONE]\n\n"
 
     # Streaming response
     if stream:
-        background_tasks = BackgroundTasks()
-        # Abort the request if the client disconnects.
-        background_tasks.add_task(abort_request)
         return StreamingResponse(completion_stream_generator(),
-                                 media_type="text/event-stream",
-                                 background=background_tasks)
+                                 media_type="text/event-stream")
 
     # Non-streaming response
     final_res: RequestOutput = None
     async for res in result_generator:
         if await raw_request.is_disconnected():
             # Abort the request if the client disconnects.
-            await abort_request()
+            await engine.abort(request_id)
             return create_error_response(HTTPStatus.BAD_REQUEST,
                                          "Client disconnected")
         final_res = res
     assert final_res is not None
     choices = []
+    prompt_token_ids = final_res.prompt_token_ids
+    prompt_logprobs = final_res.prompt_logprobs
+    prompt_text = final_res.prompt
    for output in final_res.outputs:
         if request.logprobs is not None:
-            logprobs = create_logprobs(output.token_ids, output.logprobs)
+            if not echo_without_generation:
+                token_ids = output.token_ids
+                top_logprobs = output.logprobs
+                if request.echo:
+                    token_ids = prompt_token_ids + token_ids
+                    top_logprobs = prompt_logprobs + top_logprobs
+            else:
+                token_ids = prompt_token_ids
+                top_logprobs = prompt_logprobs
+            logprobs = create_logprobs(
+                token_ids=token_ids,
+                top_logprobs=top_logprobs,
+                num_output_top_logprobs=request.logprobs,
+            )
         else:
             logprobs = None
+        if not echo_without_generation:
+            output_text = output.text
+            if request.echo:
+                output_text = prompt_text + output_text
+        else:
+            output_text = prompt_text
         choice_data = CompletionResponseChoice(
             index=output.index,
-            text=output.text,
+            text=output_text,
             logprobs=logprobs,
             finish_reason=output.finish_reason,
         )
@@ -579,37 +698,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="vLLM OpenAI-Compatible RESTful API server.")
-    parser.add_argument("--host",
-                        type=str,
-                        default="localhost",
-                        help="host name")
-    parser.add_argument("--port", type=int, default=8000, help="port number")
-    parser.add_argument("--allow-credentials",
-                        action="store_true",
-                        help="allow credentials")
-    parser.add_argument("--allowed-origins",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed origins")
-    parser.add_argument("--allowed-methods",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed methods")
-    parser.add_argument("--allowed-headers",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed headers")
-    parser.add_argument("--served-model-name",
-                        type=str,
-                        default=None,
-                        help="The model name used in the API. If not "
-                        "specified, the model name will be the same as "
-                        "the huggingface name.")
-
-    parser = AsyncEngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    args = parse_args()
 
     app.add_middleware(
         CORSMiddleware,
@@ -626,16 +715,22 @@ if __name__ == "__main__":
     else:
         served_model = args.model
 
+    response_role = args.response_role
+
     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args,
-                                             start_engine_loop=False)
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
     engine_model_config = asyncio.run(engine.get_model_config())
-    max_model_len = engine_model_config.get_max_model_len()
+    max_model_len = engine_model_config.max_model_len
 
     # A separate tokenizer to map token IDs to strings.
-    tokenizer = get_tokenizer(engine_args.tokenizer,
-                              tokenizer_mode=engine_args.tokenizer_mode,
-                              trust_remote_code=engine_args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        engine_model_config.tokenizer,
+        tokenizer_mode=engine_model_config.tokenizer_mode,
+        trust_remote_code=engine_model_config.trust_remote_code)
+    load_chat_template(args, tokenizer)
+
+    # Register labels for metrics
+    add_global_metrics_labels(model_name=engine_args.model)
 
     uvicorn.run(app,
                 host=args.host,
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -58,7 +58,7 @@ class ChatCompletionRequest(BaseModel):
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 1.0
     n: Optional[int] = 1
-    max_tokens: Optional[int] = 16
+    max_tokens: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
     presence_penalty: Optional[float] = 0.0
@@ -70,6 +70,11 @@ class ChatCompletionRequest(BaseModel):
     top_k: Optional[int] = -1
     ignore_eos: Optional[bool] = False
     use_beam_search: Optional[bool] = False
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+    skip_special_tokens: Optional[bool] = True
+    spaces_between_special_tokens: Optional[bool] = True
+    add_generation_prompt: Optional[bool] = True
+    echo: Optional[bool] = False
 
 
 class CompletionRequest(BaseModel):
@@ -94,14 +99,16 @@ class CompletionRequest(BaseModel):
     top_k: Optional[int] = -1
     ignore_eos: Optional[bool] = False
     use_beam_search: Optional[bool] = False
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+    skip_special_tokens: Optional[bool] = True
+    spaces_between_special_tokens: Optional[bool] = True
 
 
 class LogProbs(BaseModel):
     text_offset: List[int] = Field(default_factory=list)
     token_logprobs: List[Optional[float]] = Field(default_factory=list)
     tokens: List[str] = Field(default_factory=list)
-    top_logprobs: List[Optional[Dict[str,
-                                     float]]] = Field(default_factory=list)
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None
 
 
 class CompletionResponseChoice(BaseModel):
@@ -133,6 +140,7 @@ class CompletionStreamResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo]
 
 
 class ChatMessage(BaseModel):
@@ -172,3 +180,5 @@ class ChatCompletionStreamResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(
+        default=None, description="data about request and response")
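
Taken together, a chat request exercising the new fields might look like this (a sketch; the values shown for the new keys are their defaults from above, except `stop_token_ids`):

    request_body = {
        "model": "facebook/opt-125m",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": None,             # new default: fill up to max_model_len
        "stop_token_ids": [2],          # stop on specific token ids
        "skip_special_tokens": True,
        "spaces_between_special_tokens": True,
        "add_generation_prompt": True,  # append the assistant prefix
        "echo": False,
    }
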
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -48,4 +48,9 @@ _setup_logger()
 
 
 def init_logger(name: str):
-    return logging.getLogger(name)
+    # Use the same settings as above for root logger
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(_default_handler)
+    logger.propagate = False
+    return logger
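
With this change, loggers returned by `init_logger` share the root vLLM handler and stop propagating, so each record is emitted exactly once:

    from vllm.logger import init_logger

    logger = init_logger(__name__)
    logger.info("formatted like the root vllm logger, with no duplicate output")
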
--- a/vllm/model_executor/__init__.py
+++ b/vllm/model_executor/__init__.py
@@ -1,9 +1,11 @@
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed
 
 __all__ = [
     "InputMetadata",
     "get_model",
+    "SamplingMetadata",
     "set_random_seed",
 ]
@@ -1,67 +1,42 @@
-from typing import Dict, List, Tuple
+from typing import List, Optional

 import torch
-from xformers.ops import AttentionBias
-
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import SequenceData


 class InputMetadata:
-    """Metadata for input sequences. Used for PagedAttention.
+    """Metadata for input sequences. Used in PagedAttention.

     Args:
-        seq_groups: List of (seq_ids, sampling_params).
-        seq_data: Seq_id -> SequenceData.
         prompt_lens: Lengths of prompts.
         slot_mapping: The address to write the new KV to of each token.
-        context_lens: the length of attention context for each generation token.
         max_context_len: The maximum context length.
+        context_lens: the length of attention context for each sequence.
         block_tables: The block tables. (Seq id -> list of physical block)
     """

     def __init__(
         self,
-        seq_groups: List[Tuple[List[int], SamplingParams]],
-        seq_data: Dict[int, SequenceData],
         prompt_lens: List[int],
         slot_mapping: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        block_tables: torch.Tensor,
+        max_context_len: Optional[int],
+        context_lens: Optional[torch.Tensor],
+        block_tables: Optional[torch.Tensor],
     ) -> None:
-        self.seq_groups = seq_groups
-        self.seq_data = seq_data
         self.prompt_lens = prompt_lens
+        self.max_context_len = max_context_len
         self.slot_mapping = slot_mapping
         self.context_lens = context_lens
-        self.max_context_len = max_context_len
         self.block_tables = block_tables

-        self.num_prompts = len(prompt_lens)
-        self.num_prompt_tokens = sum(prompt_lens)
-        self.num_generation_tokens = context_lens.shape[0]
-        self.num_valid_tokens = slot_mapping.shape[0]
-        if block_tables.numel() > 0:
-            self.max_num_blocks_per_seq = block_tables.shape[1]
-        else:
-            self.max_num_blocks_per_seq = 0
-        assert block_tables.shape[0] == self.num_generation_tokens
-        assert context_lens.shape[0] == self.num_generation_tokens
+        self.is_prompt = len(prompt_lens) > 0

         # Set during the execution of the first attention op.
-        self.attn_bias: List[AttentionBias] = []
+        # FIXME(woosuk): This is a hack.
+        self.attn_bias = None

     def __repr__(self) -> str:
-        # Print only useful metadata.
-        return (f'InputMetadata('
-                f'num_valid_tokens={self.num_valid_tokens}, '
-                f'num_prompt_tokens={self.num_prompt_tokens}, '
-                f'num_prompts={self.num_prompts}, '
-                f'prompt_lens={self.prompt_lens}, '
-                f'num_generation_tokens={self.num_generation_tokens}, '
-                f'context_lens={self.context_lens}, '
-                f'max_context_len={self.max_context_len}), '
-                f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
-                f'block_tables={self.block_tables}), '
-                f'slot_mapping={self.slot_mapping}')
+        return ("InputMetadata("
+                f"prompt_lens={self.prompt_lens}, "
+                f"max_context_len={self.max_context_len}, "
+                f"slot_mapping={self.slot_mapping}, "
+                f"context_lens={self.context_lens}, "
+                f"block_tables={self.block_tables})")
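After this rewrite, sampling state (`seq_groups`, `seq_data`) moves out of `InputMetadata` into the new `SamplingMetadata`, leaving only attention inputs, and the decode-only fields become `Optional`. A hedged construction sketch — the tensor contents are placeholder values, and during a prompt run the decode-only fields are simply `None`:

import torch
from vllm.model_executor import InputMetadata

# Prompt phase: one sequence of 4 tokens; decode-only fields are None.
prompt_metadata = InputMetadata(
    prompt_lens=[4],
    slot_mapping=torch.tensor([0, 1, 2, 3]),  # placeholder KV-cache slots
    max_context_len=None,
    context_lens=None,
    block_tables=None,
)
assert prompt_metadata.is_prompt  # derived from len(prompt_lens) > 0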
@@ -1,48 +1,113 @@
 """Custom activation functions."""
+import math
+from typing import Optional
+
 import torch
 import torch.nn as nn
+import torch.nn.functional as F

-from vllm import activation_ops
+from vllm._C import ops
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.utils import divide
+from vllm.model_executor.utils import set_weight_attrs


 class SiluAndMul(nn.Module):
     """An activation function for SwiGLU.

-    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2.
+    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

     Shapes:
-        x: (num_tokens, 2 * d)
-        return: (num_tokens, d)
+        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
+        return: (batch_size, seq_len, d) or (num_tokens, d)
     """

+    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        d = x.shape[-1] // 2
+        return F.silu(x[..., :d]) * x[..., d:]
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        num_tokens = x.shape[0]
-        d = x.shape[1] // 2
-        out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
-        activation_ops.silu_and_mul(out, x)
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
         return out


 class NewGELU(nn.Module):

+    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        c = math.sqrt(2.0 / math.pi)
+        return 0.5 * x * (1.0 + torch.tanh(c *
+                                           (x + 0.044715 * torch.pow(x, 3.0))))
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        num_tokens = x.shape[0]
-        d = x.shape[1]
-        out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
-        activation_ops.gelu_new(out, x)
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
         return out


 class FastGELU(nn.Module):

+    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
+                                           (1.0 + 0.044715 * x * x)))
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        num_tokens = x.shape[0]
-        d = x.shape[1]
-        out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
-        activation_ops.gelu_fast(out, x)
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
         return out


+class ScaledActivation(nn.Module):
+    """An activation function with post-scale parameters.
+
+    This is used for some quantization methods like AWQ.
+    """
+
+    def __init__(
+        self,
+        act_module: nn.Module,
+        intermediate_size: int,
+        input_is_parallel: bool = True,
+        params_dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__()
+        self.act = act_module
+        self.input_is_parallel = input_is_parallel
+        if input_is_parallel:
+            tp_size = get_tensor_model_parallel_world_size()
+            intermediate_size_per_partition = divide(intermediate_size,
+                                                     tp_size)
+        else:
+            intermediate_size_per_partition = intermediate_size
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.scales = nn.Parameter(
+            torch.empty(intermediate_size_per_partition,
+                        dtype=params_dtype,
+                        device="cuda"))
+        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.act(x) / self.scales
+
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        if self.input_is_parallel:
+            tp_rank = get_tensor_model_parallel_rank()
+            shard_size = param_data.shape[0]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
 _ACTIVATION_REGISTRY = {
     "gelu": nn.GELU(),
     "gelu_fast": FastGELU(),
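The new `_forward` methods give each activation a pure-PyTorch reference path alongside the fused CUDA kernel, which is handy for checking the custom ops. A small sketch comparing the two paths — it assumes a CUDA device and the compiled `vllm._C` extension are available, and the shape/tolerances are illustrative:

import torch
from vllm.model_executor.layers.activation import SiluAndMul

act = SiluAndMul()
x = torch.randn(8, 2 * 128, dtype=torch.float16, device="cuda")
ref = act._forward(x)  # PyTorch-native reference implementation
out = act(x)           # fused ops.silu_and_mul CUDA kernel
assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)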
@@ -52,9 +117,25 @@ _ACTIVATION_REGISTRY = {
 }


-def get_act_fn(act_fn: str) -> nn.Module:
+def get_act_fn(
+    act_fn_name: str,
+    quant_config: Optional[QuantizationConfig] = None,
+    intermediate_size: Optional[int] = None,
+    input_is_parallel: bool = True,
+    params_dtype: Optional[torch.dtype] = None,
+) -> nn.Module:
     """Get an activation function by name."""
-    act_fn = act_fn.lower()
-    if act_fn in _ACTIVATION_REGISTRY:
-        return _ACTIVATION_REGISTRY[act_fn]
-    raise ValueError(f"Activation function {act_fn!r} is not supported.")
+    act_fn_name = act_fn_name.lower()
+    if act_fn_name not in _ACTIVATION_REGISTRY:
+        raise ValueError(
+            f"Activation function {act_fn_name!r} is not supported.")
+
+    act_fn = _ACTIVATION_REGISTRY[act_fn_name]
+    if (quant_config is not None
+            and act_fn_name in quant_config.get_scaled_act_names()):
+        if intermediate_size is None:
+            raise ValueError("intermediate_size must be specified for scaled "
+                             "activation functions.")
+        return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
+                                params_dtype)
+    return act_fn
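`get_act_fn` now optionally wraps the registry entry in `ScaledActivation` when the quantization config declares that activation as scaled. A hedged usage sketch — the intermediate size below is illustrative, and passing no quant_config preserves the old one-argument behavior:

from vllm.model_executor.layers.activation import get_act_fn

# Unquantized path: behaves like the old get_act_fn(name).
act = get_act_fn("gelu_fast")

# Quantized path (e.g. AWQ): if the name appears in
# quant_config.get_scaled_act_names(), the result is a ScaledActivation
# wrapping the base module, and intermediate_size must be supplied.
# act = get_act_fn("gelu_fast", quant_config, intermediate_size=11008)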