Compare commits
25 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
791d79de32 | ||
|
|
94d2f59895 | ||
|
|
75c0ca9d43 | ||
|
|
2a4ec90854 | ||
|
|
85ebcda94d | ||
|
|
d64bf1646c | ||
|
|
a41c20435e | ||
|
|
eedac9dba0 | ||
|
|
14f9c72bfd | ||
|
|
ad5f2fe34c | ||
|
|
4f8584756d | ||
|
|
65fc1c3127 | ||
|
|
c393af6cd7 | ||
|
|
0c04ce3234 | ||
|
|
73b3de79ea | ||
|
|
d1744376ae | ||
|
|
805de738f6 | ||
|
|
1b151ed181 | ||
|
|
e06f504a76 | ||
|
|
462ae5220a | ||
|
|
66c54aa9c3 | ||
|
|
735ecfff61 | ||
|
|
a57d13cc96 | ||
|
|
79af7e96a0 | ||
|
|
621980bdc0 |
101
.github/workflows/publish.yml
vendored
Normal file
101
.github/workflows/publish.yml
vendored
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
# This workflow will upload a Python Package to Release asset
|
||||||
|
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
|
||||||
|
|
||||||
|
name: Create Release
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- v*
|
||||||
|
|
||||||
|
# Needed to create release and upload assets
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
release:
|
||||||
|
# Retrieve tag and create release
|
||||||
|
name: Create Release
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
outputs:
|
||||||
|
upload_url: ${{ steps.create_release.outputs.upload_url }}
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Extract branch info
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Create Release
|
||||||
|
id: create_release
|
||||||
|
uses: "actions/github-script@v6"
|
||||||
|
env:
|
||||||
|
RELEASE_TAG: ${{ env.release_tag }}
|
||||||
|
with:
|
||||||
|
github-token: "${{ secrets.GITHUB_TOKEN }}"
|
||||||
|
script: |
|
||||||
|
const script = require('.github/workflows/scripts/create_release.js')
|
||||||
|
await script(github, context, core)
|
||||||
|
|
||||||
|
wheel:
|
||||||
|
name: Build Wheel
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
needs: release
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
os: ['ubuntu-20.04']
|
||||||
|
python-version: ['3.8', '3.9', '3.10', '3.11']
|
||||||
|
cuda-version: ['11.8'] # Github runner can't build anything older than 11.8
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Linux Env
|
||||||
|
if: ${{ runner.os == 'Linux' }}
|
||||||
|
run: |
|
||||||
|
bash -x .github/workflows/scripts/env.sh
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
|
- name: Install CUDA ${{ matrix.cuda-version }}
|
||||||
|
run: |
|
||||||
|
bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
|
||||||
|
|
||||||
|
- name: Install PyTorch-cu${{ matrix.cuda-version }}
|
||||||
|
run: |
|
||||||
|
bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
|
||||||
|
|
||||||
|
- name: Build wheel
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
|
||||||
|
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
|
||||||
|
asset_name=${wheel_name//"linux"/"manylinux1"}
|
||||||
|
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
|
||||||
|
echo "asset_name=${asset_name}" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Upload Release Asset
|
||||||
|
uses: actions/upload-release-asset@v1
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
with:
|
||||||
|
upload_url: ${{ needs.release.outputs.upload_url }}
|
||||||
|
asset_path: ./dist/${{ env.wheel_name }}
|
||||||
|
asset_name: ${{ env.asset_name }}
|
||||||
|
asset_content_type: application/*
|
||||||
|
|
||||||
|
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
|
||||||
|
# - name: Publish package
|
||||||
|
# uses: pypa/gh-action-pypi-publish@release/v1.8
|
||||||
|
# with:
|
||||||
|
# repository-url: https://test.pypi.org/legacy/
|
||||||
|
# password: ${{ secrets.PYPI_API_TOKEN }}
|
||||||
|
# skip-existing: true
|
||||||
15
.github/workflows/scripts/build.sh
vendored
Normal file
15
.github/workflows/scripts/build.sh
vendored
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
python_executable=python$1
|
||||||
|
cuda_home=/usr/local/cuda-$2
|
||||||
|
|
||||||
|
# Update paths
|
||||||
|
PATH=${cuda_home}/bin:$PATH
|
||||||
|
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
|
# Install requirements
|
||||||
|
$python_executable -m pip install wheel packaging
|
||||||
|
$python_executable -m pip install -r requirements.txt
|
||||||
|
|
||||||
|
# Build
|
||||||
|
$python_executable setup.py bdist_wheel --dist-dir=dist
|
||||||
20
.github/workflows/scripts/create_release.js
vendored
Normal file
20
.github/workflows/scripts/create_release.js
vendored
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
// Uses Github's API to create the release and wait for result.
|
||||||
|
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
|
||||||
|
|
||||||
|
module.exports = async (github, context, core) => {
|
||||||
|
try {
|
||||||
|
const response = await github.rest.repos.createRelease({
|
||||||
|
draft: false,
|
||||||
|
generate_release_notes: true,
|
||||||
|
name: process.env.RELEASE_TAG,
|
||||||
|
owner: context.repo.owner,
|
||||||
|
prerelease: false,
|
||||||
|
repo: context.repo.repo,
|
||||||
|
tag_name: process.env.RELEASE_TAG,
|
||||||
|
});
|
||||||
|
|
||||||
|
core.setOutput('upload_url', response.data.upload_url);
|
||||||
|
} catch (error) {
|
||||||
|
core.setFailed(error.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
18
.github/workflows/scripts/cuda-install.sh
vendored
Normal file
18
.github/workflows/scripts/cuda-install.sh
vendored
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Replace '.' with '-' ex: 11.8 -> 11-8
|
||||||
|
cuda_version=$(echo $1 | tr "." "-")
|
||||||
|
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
|
||||||
|
OS=$(echo $2 | tr -d ".\-")
|
||||||
|
|
||||||
|
# Installs CUDA
|
||||||
|
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
|
||||||
|
sudo dpkg -i cuda-keyring_1.1-1_all.deb
|
||||||
|
rm cuda-keyring_1.1-1_all.deb
|
||||||
|
sudo apt -qq update
|
||||||
|
sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
|
||||||
|
sudo apt clean
|
||||||
|
|
||||||
|
# Test nvcc
|
||||||
|
PATH=/usr/local/cuda-$1/bin:${PATH}
|
||||||
|
nvcc --version
|
||||||
56
.github/workflows/scripts/env.sh
vendored
Normal file
56
.github/workflows/scripts/env.sh
vendored
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This file installs common linux environment tools
|
||||||
|
|
||||||
|
export LANG C.UTF-8
|
||||||
|
|
||||||
|
# python_version=$1
|
||||||
|
|
||||||
|
sudo apt-get update && \
|
||||||
|
sudo apt-get install -y --no-install-recommends \
|
||||||
|
software-properties-common \
|
||||||
|
|
||||||
|
sudo apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
apt-utils \
|
||||||
|
ca-certificates \
|
||||||
|
wget \
|
||||||
|
git \
|
||||||
|
vim \
|
||||||
|
libssl-dev \
|
||||||
|
curl \
|
||||||
|
unzip \
|
||||||
|
unrar \
|
||||||
|
cmake \
|
||||||
|
net-tools \
|
||||||
|
sudo \
|
||||||
|
autotools-dev \
|
||||||
|
rsync \
|
||||||
|
jq \
|
||||||
|
openssh-server \
|
||||||
|
tmux \
|
||||||
|
screen \
|
||||||
|
htop \
|
||||||
|
pdsh \
|
||||||
|
openssh-client \
|
||||||
|
lshw \
|
||||||
|
dmidecode \
|
||||||
|
util-linux \
|
||||||
|
automake \
|
||||||
|
autoconf \
|
||||||
|
libtool \
|
||||||
|
net-tools \
|
||||||
|
pciutils \
|
||||||
|
libpci-dev \
|
||||||
|
libaio-dev \
|
||||||
|
libcap2 \
|
||||||
|
libtinfo5 \
|
||||||
|
fakeroot \
|
||||||
|
devscripts \
|
||||||
|
debhelper \
|
||||||
|
nfs-common
|
||||||
|
|
||||||
|
# Remove github bloat files to free up disk space
|
||||||
|
sudo rm -rf "/usr/local/share/boost"
|
||||||
|
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||||
|
sudo rm -rf "/usr/share/dotnet"
|
||||||
14
.github/workflows/scripts/pytorch-install.sh
vendored
Normal file
14
.github/workflows/scripts/pytorch-install.sh
vendored
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
python_executable=python$1
|
||||||
|
cuda_version=$2
|
||||||
|
|
||||||
|
# Install torch
|
||||||
|
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
|
||||||
|
$python_executable -m pip install torch -f https://download.pytorch.org/whl/cu${cuda_version//./}/torch_stable.html
|
||||||
|
|
||||||
|
# Print version information
|
||||||
|
$python_executable --version
|
||||||
|
$python_executable -c "import torch; print('PyTorch:', torch.__version__)"
|
||||||
|
$python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
|
||||||
|
$python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
|
||||||
@@ -42,6 +42,7 @@ vLLM is flexible and easy to use with:
|
|||||||
|
|
||||||
vLLM seamlessly supports many Huggingface models, including the following architectures:
|
vLLM seamlessly supports many Huggingface models, including the following architectures:
|
||||||
|
|
||||||
|
- Aquila (`BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
|
||||||
- Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
|
- Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
|
||||||
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
|
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
|
||||||
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
|
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
|
||||||
@@ -49,9 +50,11 @@ vLLM seamlessly supports many Huggingface models, including the following archit
|
|||||||
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
|
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
|
||||||
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
|
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
|
||||||
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
|
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
|
||||||
|
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
|
||||||
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
|
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
|
||||||
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
|
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
|
||||||
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
|
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
|
||||||
|
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
|
||||||
|
|
||||||
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
|
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
|
||||||
|
|
||||||
|
|||||||
@@ -4,9 +4,25 @@ void silu_and_mul(
|
|||||||
torch::Tensor& out,
|
torch::Tensor& out,
|
||||||
torch::Tensor& input);
|
torch::Tensor& input);
|
||||||
|
|
||||||
|
void gelu_new(
|
||||||
|
torch::Tensor& out,
|
||||||
|
torch::Tensor& input);
|
||||||
|
|
||||||
|
void gelu_fast(
|
||||||
|
torch::Tensor& out,
|
||||||
|
torch::Tensor& input);
|
||||||
|
|
||||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
||||||
m.def(
|
m.def(
|
||||||
"silu_and_mul",
|
"silu_and_mul",
|
||||||
&silu_and_mul,
|
&silu_and_mul,
|
||||||
"Activation function used in SwiGLU.");
|
"Activation function used in SwiGLU.");
|
||||||
|
m.def(
|
||||||
|
"gelu_new",
|
||||||
|
&gelu_new,
|
||||||
|
"GELU implementation used in GPT-2.");
|
||||||
|
m.def(
|
||||||
|
"gelu_fast",
|
||||||
|
&gelu_fast,
|
||||||
|
"Approximate GELU implementation.");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,3 +46,71 @@ void silu_and_mul(
|
|||||||
d);
|
d);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace vllm {
|
||||||
|
|
||||||
|
// Element-wise activation kernel template.
|
||||||
|
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
||||||
|
__global__ void activation_kernel(
|
||||||
|
scalar_t* __restrict__ out, // [num_tokens, d]
|
||||||
|
const scalar_t* __restrict__ input, // [num_tokens, d]
|
||||||
|
const int d) {
|
||||||
|
const int token_idx = blockIdx.x;
|
||||||
|
for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
|
const scalar_t x = __ldg(&input[token_idx * d + idx]);
|
||||||
|
out[token_idx * d + idx] = ACT_FN(x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
// Launch element-wise activation kernel.
|
||||||
|
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
|
||||||
|
int num_tokens = input.size(0); \
|
||||||
|
int d = input.size(1); \
|
||||||
|
dim3 grid(num_tokens); \
|
||||||
|
dim3 block(std::min(d, 1024)); \
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||||
|
AT_DISPATCH_FLOATING_TYPES_AND2( \
|
||||||
|
at::ScalarType::Half, \
|
||||||
|
at::ScalarType::BFloat16, \
|
||||||
|
input.scalar_type(), \
|
||||||
|
"activation_kernel", \
|
||||||
|
[&] { \
|
||||||
|
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
|
||||||
|
out.data_ptr<scalar_t>(), \
|
||||||
|
input.data_ptr<scalar_t>(), \
|
||||||
|
d); \
|
||||||
|
});
|
||||||
|
|
||||||
|
namespace vllm {
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
|
||||||
|
const float x3 = (float) (x * x * x);
|
||||||
|
const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
|
||||||
|
return ((T) 0.5) * x * (((T) 1.0) + t);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
|
||||||
|
const float f = (float) x;
|
||||||
|
const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
|
||||||
|
return ((T) 0.5) * x * (((T) 1.0) + t);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
void gelu_new(
|
||||||
|
torch::Tensor& out, // [num_tokens, d]
|
||||||
|
torch::Tensor& input) // [num_tokens, d]
|
||||||
|
{
|
||||||
|
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gelu_fast(
|
||||||
|
torch::Tensor& out, // [num_tokens, d]
|
||||||
|
torch::Tensor& input) // [num_tokens, d]
|
||||||
|
{
|
||||||
|
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
|
||||||
|
}
|
||||||
|
|||||||
@@ -86,6 +86,8 @@ __global__ void single_query_cached_kv_attention_kernel(
|
|||||||
const int kv_block_stride,
|
const int kv_block_stride,
|
||||||
const int kv_head_stride) {
|
const int kv_head_stride) {
|
||||||
constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
||||||
|
constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
|
||||||
|
assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
|
||||||
constexpr int NUM_TOKENS_PER_THREAD_GROUP = (BLOCK_SIZE + WARP_SIZE - 1) / WARP_SIZE;
|
constexpr int NUM_TOKENS_PER_THREAD_GROUP = (BLOCK_SIZE + WARP_SIZE - 1) / WARP_SIZE;
|
||||||
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||||
const int thread_idx = threadIdx.x;
|
const int thread_idx = threadIdx.x;
|
||||||
@@ -120,12 +122,13 @@ __global__ void single_query_cached_kv_attention_kernel(
|
|||||||
// th vectors of the query, and so on.
|
// th vectors of the query, and so on.
|
||||||
// NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous.
|
// NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous.
|
||||||
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||||
Q_vec q_vecs[NUM_VECS_PER_THREAD];
|
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < NUM_VECS_PER_THREAD; i++) {
|
for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
|
||||||
const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
|
const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
|
||||||
q_vecs[i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
|
q_vecs[thread_group_offset][i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
|
||||||
}
|
}
|
||||||
|
__syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs
|
||||||
|
|
||||||
// Memory planning.
|
// Memory planning.
|
||||||
extern __shared__ char shared_mem[];
|
extern __shared__ char shared_mem[];
|
||||||
@@ -173,7 +176,7 @@ __global__ void single_query_cached_kv_attention_kernel(
|
|||||||
|
|
||||||
// Compute dot product.
|
// Compute dot product.
|
||||||
// This includes a reduction across the threads in the same thread group.
|
// This includes a reduction across the threads in the same thread group.
|
||||||
float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs, k_vecs);
|
float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
|
||||||
// Add the ALiBi bias if slopes are given.
|
// Add the ALiBi bias if slopes are given.
|
||||||
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len) : 0;
|
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len) : 0;
|
||||||
|
|
||||||
|
|||||||
@@ -14,9 +14,12 @@ Alongside each architecture, we include some popular models that use it.
|
|||||||
* - Architecture
|
* - Architecture
|
||||||
- Models
|
- Models
|
||||||
- Example HuggingFace Models
|
- Example HuggingFace Models
|
||||||
|
* - :code:`AquilaForCausalLM`
|
||||||
|
- Aqualia
|
||||||
|
- :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
|
||||||
* - :code:`BaiChuanForCausalLM`
|
* - :code:`BaiChuanForCausalLM`
|
||||||
- Baichuan
|
- Baichuan
|
||||||
- :code:`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.
|
- :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc.
|
||||||
* - :code:`BloomForCausalLM`
|
* - :code:`BloomForCausalLM`
|
||||||
- BLOOM, BLOOMZ, BLOOMChat
|
- BLOOM, BLOOMZ, BLOOMChat
|
||||||
- :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
|
- :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
|
||||||
@@ -35,6 +38,9 @@ Alongside each architecture, we include some popular models that use it.
|
|||||||
* - :code:`GPTNeoXForCausalLM`
|
* - :code:`GPTNeoXForCausalLM`
|
||||||
- GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
|
- GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
|
||||||
- :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
|
- :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
|
||||||
|
* - :code:`InternLMForCausalLM`
|
||||||
|
- InternLM
|
||||||
|
- :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc.
|
||||||
* - :code:`LlamaForCausalLM`
|
* - :code:`LlamaForCausalLM`
|
||||||
- LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco
|
- LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco
|
||||||
- :code:`meta-llama/Llama-2-13b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, :code:`JosephusCheung/Guanaco`, etc.
|
- :code:`meta-llama/Llama-2-13b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, :code:`JosephusCheung/Guanaco`, etc.
|
||||||
@@ -44,6 +50,9 @@ Alongside each architecture, we include some popular models that use it.
|
|||||||
* - :code:`OPTForCausalLM`
|
* - :code:`OPTForCausalLM`
|
||||||
- OPT, OPT-IML
|
- OPT, OPT-IML
|
||||||
- :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
|
- :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
|
||||||
|
* - :code:`OPTForCausalLM`
|
||||||
|
- Qwen
|
||||||
|
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
|
||||||
|
|
||||||
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
|
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
|
||||||
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
|
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ sentencepiece # Required for LLaMA tokenizer.
|
|||||||
numpy
|
numpy
|
||||||
torch >= 2.0.0
|
torch >= 2.0.0
|
||||||
transformers >= 4.31.0 # Required for LLaMA-2.
|
transformers >= 4.31.0 # Required for LLaMA-2.
|
||||||
xformers >= 0.0.19
|
xformers >= 0.0.21
|
||||||
fastapi
|
fastapi
|
||||||
uvicorn
|
uvicorn
|
||||||
pydantic < 2 # Required for OpenAI server.
|
pydantic < 2 # Required for OpenAI server.
|
||||||
|
|||||||
29
setup.py
29
setup.py
@@ -22,7 +22,7 @@ NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
|
|||||||
|
|
||||||
if CUDA_HOME is None:
|
if CUDA_HOME is None:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Cannot find CUDA_HOME. CUDA must be available in order to build the package.")
|
f"Cannot find CUDA_HOME. CUDA must be available to build the package.")
|
||||||
|
|
||||||
|
|
||||||
def get_nvcc_cuda_version(cuda_dir: str) -> Version:
|
def get_nvcc_cuda_version(cuda_dir: str) -> Version:
|
||||||
@@ -47,12 +47,6 @@ for i in range(device_count):
|
|||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"GPUs with compute capability less than 7.0 are not supported.")
|
"GPUs with compute capability less than 7.0 are not supported.")
|
||||||
compute_capabilities.add(major * 10 + minor)
|
compute_capabilities.add(major * 10 + minor)
|
||||||
# If no GPU is available, add all supported compute capabilities.
|
|
||||||
if not compute_capabilities:
|
|
||||||
compute_capabilities = {70, 75, 80, 86, 90}
|
|
||||||
# Add target compute capabilities to NVCC flags.
|
|
||||||
for capability in compute_capabilities:
|
|
||||||
NVCC_FLAGS += ["-gencode", f"arch=compute_{capability},code=sm_{capability}"]
|
|
||||||
|
|
||||||
# Validate the NVCC CUDA version.
|
# Validate the NVCC CUDA version.
|
||||||
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
|
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
|
||||||
@@ -61,10 +55,31 @@ if nvcc_cuda_version < Version("11.0"):
|
|||||||
if 86 in compute_capabilities and nvcc_cuda_version < Version("11.1"):
|
if 86 in compute_capabilities and nvcc_cuda_version < Version("11.1"):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"CUDA 11.1 or higher is required for GPUs with compute capability 8.6.")
|
"CUDA 11.1 or higher is required for GPUs with compute capability 8.6.")
|
||||||
|
if 89 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
|
||||||
|
# CUDA 11.8 is required to generate the code targeting compute capability 8.9.
|
||||||
|
# However, GPUs with compute capability 8.9 can also run the code generated by
|
||||||
|
# the previous versions of CUDA 11 and targeting compute capability 8.0.
|
||||||
|
# Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
|
||||||
|
# instead of 8.9.
|
||||||
|
compute_capabilities.remove(89)
|
||||||
|
compute_capabilities.add(80)
|
||||||
if 90 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
|
if 90 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"CUDA 11.8 or higher is required for GPUs with compute capability 9.0.")
|
"CUDA 11.8 or higher is required for GPUs with compute capability 9.0.")
|
||||||
|
|
||||||
|
# If no GPU is available, add all supported compute capabilities.
|
||||||
|
if not compute_capabilities:
|
||||||
|
compute_capabilities = {70, 75, 80}
|
||||||
|
if nvcc_cuda_version >= Version("11.1"):
|
||||||
|
compute_capabilities.add(86)
|
||||||
|
if nvcc_cuda_version >= Version("11.8"):
|
||||||
|
compute_capabilities.add(89)
|
||||||
|
compute_capabilities.add(90)
|
||||||
|
|
||||||
|
# Add target compute capabilities to NVCC flags.
|
||||||
|
for capability in compute_capabilities:
|
||||||
|
NVCC_FLAGS += ["-gencode", f"arch=compute_{capability},code=sm_{capability}"]
|
||||||
|
|
||||||
# Use NVCC threads to parallelize the build.
|
# Use NVCC threads to parallelize the build.
|
||||||
if nvcc_cuda_version >= Version("11.2"):
|
if nvcc_cuda_version >= Version("11.2"):
|
||||||
num_threads = min(os.cpu_count(), 8)
|
num_threads = min(os.cpu_count(), 8)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
from transformers.activations import get_activation
|
||||||
from vllm import activation_ops
|
from vllm import activation_ops
|
||||||
|
|
||||||
|
|
||||||
@@ -28,3 +28,45 @@ def test_silu_and_mul() -> None:
|
|||||||
for d in [512, 4096, 5120, 13824]:
|
for d in [512, 4096, 5120, 13824]:
|
||||||
print(f'Testing dtype={dtype}, num_tokens={num_tokens}, d={d}')
|
print(f'Testing dtype={dtype}, num_tokens={num_tokens}, d={d}')
|
||||||
run_silu_and_mul(num_tokens, d, dtype)
|
run_silu_and_mul(num_tokens, d, dtype)
|
||||||
|
|
||||||
|
|
||||||
|
@torch.inference_mode()
|
||||||
|
def run_gelu_new(
|
||||||
|
num_tokens: int,
|
||||||
|
d: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
) -> None:
|
||||||
|
x = torch.randn(num_tokens, d, dtype=dtype, device='cuda')
|
||||||
|
out = torch.empty(num_tokens, d, dtype=dtype, device='cuda')
|
||||||
|
activation_ops.gelu_new(out, x)
|
||||||
|
ref_out = get_activation("gelu_new")(x)
|
||||||
|
assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
|
||||||
|
|
||||||
|
|
||||||
|
def test_gelu_new() -> None:
|
||||||
|
for dtype in [torch.half, torch.bfloat16, torch.float]:
|
||||||
|
for num_tokens in [7, 83, 2048]:
|
||||||
|
for d in [512, 4096, 5120, 13824]:
|
||||||
|
print(f'Testing dtype={dtype}, num_tokens={num_tokens}, d={d}')
|
||||||
|
run_gelu_new(num_tokens, d, dtype)
|
||||||
|
|
||||||
|
|
||||||
|
@torch.inference_mode()
|
||||||
|
def run_gelu_fast(
|
||||||
|
num_tokens: int,
|
||||||
|
d: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
) -> None:
|
||||||
|
x = torch.randn(num_tokens, d, dtype=dtype, device='cuda')
|
||||||
|
out = torch.empty(num_tokens, d, dtype=dtype, device='cuda')
|
||||||
|
activation_ops.gelu_fast(out, x)
|
||||||
|
ref_out = get_activation("gelu_fast")(x)
|
||||||
|
assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
|
||||||
|
|
||||||
|
|
||||||
|
def test_gelu_fast() -> None:
|
||||||
|
for dtype in [torch.half, torch.bfloat16, torch.float]:
|
||||||
|
for num_tokens in [7, 83, 2048]:
|
||||||
|
for d in [512, 4096, 5120, 13824]:
|
||||||
|
print(f'Testing dtype={dtype}, num_tokens={num_tokens}, d={d}')
|
||||||
|
run_gelu_fast(num_tokens, d, dtype)
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from vllm.entrypoints.llm import LLM
|
|||||||
from vllm.outputs import CompletionOutput, RequestOutput
|
from vllm.outputs import CompletionOutput, RequestOutput
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
|
|
||||||
__version__ = "0.1.3"
|
__version__ = "0.1.4"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"LLM",
|
"LLM",
|
||||||
|
|||||||
@@ -98,9 +98,11 @@ class ModelConfig:
|
|||||||
# Note: for falcon, when new_decoder_architecture is True, the
|
# Note: for falcon, when new_decoder_architecture is True, the
|
||||||
# multi_query flag is ignored and we use n_head_kv for the number of
|
# multi_query flag is ignored and we use n_head_kv for the number of
|
||||||
# KV heads.
|
# KV heads.
|
||||||
if (getattr(self.hf_config, "multi_query", False) and
|
new_decoder_arch_falcon = (
|
||||||
(self.hf_config.model_type == "falcon" and
|
self.hf_config.model_type == "falcon"
|
||||||
not getattr(self.hf_config, "new_decoder_architecture", False))):
|
and getattr(self.hf_config, "new_decoder_architecture", False))
|
||||||
|
if not new_decoder_arch_falcon and getattr(self.hf_config,
|
||||||
|
"multi_query", False):
|
||||||
# Multi-query attention, only one KV head.
|
# Multi-query attention, only one KV head.
|
||||||
return 1
|
return 1
|
||||||
# For Falcon:
|
# For Falcon:
|
||||||
|
|||||||
@@ -379,9 +379,6 @@ class Scheduler:
|
|||||||
seq_group: SequenceGroup,
|
seq_group: SequenceGroup,
|
||||||
blocks_to_swap_out: Dict[int, int],
|
blocks_to_swap_out: Dict[int, int],
|
||||||
) -> None:
|
) -> None:
|
||||||
seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
|
|
||||||
for seq in seqs:
|
|
||||||
seq.status = SequenceStatus.SWAPPED
|
|
||||||
self._swap_out(seq_group, blocks_to_swap_out)
|
self._swap_out(seq_group, blocks_to_swap_out)
|
||||||
self.swapped.append(seq_group)
|
self.swapped.append(seq_group)
|
||||||
|
|
||||||
|
|||||||
@@ -3,18 +3,18 @@
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
from http import HTTPStatus
|
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
from typing import AsyncGenerator, Dict, List, Optional
|
from http import HTTPStatus
|
||||||
from packaging import version
|
from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import fastapi
|
import fastapi
|
||||||
|
import uvicorn
|
||||||
from fastapi import BackgroundTasks, Request
|
from fastapi import BackgroundTasks, Request
|
||||||
from fastapi.exceptions import RequestValidationError
|
from fastapi.exceptions import RequestValidationError
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.responses import JSONResponse, StreamingResponse
|
from fastapi.responses import JSONResponse, StreamingResponse
|
||||||
import uvicorn
|
from packaging import version
|
||||||
|
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||||
@@ -115,12 +115,22 @@ async def get_gen_prompt(request) -> str:
|
|||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
async def check_length(request, prompt):
|
async def check_length(
|
||||||
input_ids = tokenizer(prompt).input_ids
|
request: Union[ChatCompletionRequest, CompletionRequest],
|
||||||
|
prompt: Optional[str] = None,
|
||||||
|
prompt_ids: Optional[List[int]] = None
|
||||||
|
) -> Tuple[List[int], Optional[JSONResponse]]:
|
||||||
|
assert (not (prompt is None and prompt_ids is None)
|
||||||
|
and not (prompt is not None and prompt_ids is not None)
|
||||||
|
), "Either prompt or prompt_ids should be provided."
|
||||||
|
if prompt_ids is not None:
|
||||||
|
input_ids = prompt_ids
|
||||||
|
else:
|
||||||
|
input_ids = tokenizer(prompt).input_ids
|
||||||
token_num = len(input_ids)
|
token_num = len(input_ids)
|
||||||
|
|
||||||
if token_num + request.max_tokens > max_model_len:
|
if token_num + request.max_tokens > max_model_len:
|
||||||
return create_error_response(
|
return input_ids, create_error_response(
|
||||||
HTTPStatus.BAD_REQUEST,
|
HTTPStatus.BAD_REQUEST,
|
||||||
f"This model's maximum context length is {max_model_len} tokens. "
|
f"This model's maximum context length is {max_model_len} tokens. "
|
||||||
f"However, you requested {request.max_tokens + token_num} tokens "
|
f"However, you requested {request.max_tokens + token_num} tokens "
|
||||||
@@ -129,7 +139,7 @@ async def check_length(request, prompt):
|
|||||||
f"Please reduce the length of the messages or completion.",
|
f"Please reduce the length of the messages or completion.",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return None
|
return input_ids, None
|
||||||
|
|
||||||
|
|
||||||
@app.get("/v1/models")
|
@app.get("/v1/models")
|
||||||
@@ -191,7 +201,7 @@ async def create_chat_completion(raw_request: Request):
|
|||||||
"logit_bias is not currently supported")
|
"logit_bias is not currently supported")
|
||||||
|
|
||||||
prompt = await get_gen_prompt(request)
|
prompt = await get_gen_prompt(request)
|
||||||
error_check_ret = await check_length(request, prompt)
|
token_ids, error_check_ret = await check_length(request, prompt=prompt)
|
||||||
if error_check_ret is not None:
|
if error_check_ret is not None:
|
||||||
return error_check_ret
|
return error_check_ret
|
||||||
|
|
||||||
@@ -215,7 +225,8 @@ async def create_chat_completion(raw_request: Request):
|
|||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
|
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
|
||||||
|
|
||||||
result_generator = engine.generate(prompt, sampling_params, request_id)
|
result_generator = engine.generate(prompt, sampling_params, request_id,
|
||||||
|
token_ids)
|
||||||
|
|
||||||
async def abort_request() -> None:
|
async def abort_request() -> None:
|
||||||
await engine.abort(request_id)
|
await engine.abort(request_id)
|
||||||
@@ -375,17 +386,34 @@ async def create_completion(raw_request: Request):
|
|||||||
|
|
||||||
model_name = request.model
|
model_name = request.model
|
||||||
request_id = f"cmpl-{random_uuid()}"
|
request_id = f"cmpl-{random_uuid()}"
|
||||||
|
|
||||||
|
use_token_ids = False
|
||||||
if isinstance(request.prompt, list):
|
if isinstance(request.prompt, list):
|
||||||
if len(request.prompt) == 0:
|
if len(request.prompt) == 0:
|
||||||
return create_error_response(HTTPStatus.BAD_REQUEST,
|
return create_error_response(HTTPStatus.BAD_REQUEST,
|
||||||
"please provide at least one prompt")
|
"please provide at least one prompt")
|
||||||
if len(request.prompt) > 1:
|
first_element = request.prompt[0]
|
||||||
return create_error_response(
|
if isinstance(first_element, int):
|
||||||
HTTPStatus.BAD_REQUEST,
|
use_token_ids = True
|
||||||
"multiple prompts in a batch is not currently supported")
|
prompt = request.prompt
|
||||||
prompt = request.prompt[0]
|
elif isinstance(first_element, (str, list)):
|
||||||
|
# TODO: handles multiple prompt case in list[list[int]]
|
||||||
|
if len(request.prompt) > 1:
|
||||||
|
return create_error_response(
|
||||||
|
HTTPStatus.BAD_REQUEST,
|
||||||
|
"multiple prompts in a batch is not currently supported")
|
||||||
|
use_token_ids = not isinstance(first_element, str)
|
||||||
|
prompt = request.prompt[0]
|
||||||
else:
|
else:
|
||||||
prompt = request.prompt
|
prompt = request.prompt
|
||||||
|
|
||||||
|
if use_token_ids:
|
||||||
|
_, error_check_ret = await check_length(request, prompt_ids=prompt)
|
||||||
|
else:
|
||||||
|
token_ids, error_check_ret = await check_length(request, prompt=prompt)
|
||||||
|
if error_check_ret is not None:
|
||||||
|
return error_check_ret
|
||||||
|
|
||||||
created_time = int(time.time())
|
created_time = int(time.time())
|
||||||
try:
|
try:
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
@@ -405,7 +433,14 @@ async def create_completion(raw_request: Request):
|
|||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
|
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
|
||||||
|
|
||||||
result_generator = engine.generate(prompt, sampling_params, request_id)
|
if use_token_ids:
|
||||||
|
result_generator = engine.generate(None,
|
||||||
|
sampling_params,
|
||||||
|
request_id,
|
||||||
|
prompt_token_ids=prompt)
|
||||||
|
else:
|
||||||
|
result_generator = engine.generate(prompt, sampling_params, request_id,
|
||||||
|
token_ids)
|
||||||
|
|
||||||
# Similar to the OpenAI API, when n != best_of, we do not stream the
|
# Similar to the OpenAI API, when n != best_of, we do not stream the
|
||||||
# results. In addition, we do not stream the results when use beam search.
|
# results. In addition, we do not stream the results when use beam search.
|
||||||
|
|||||||
@@ -74,7 +74,8 @@ class ChatCompletionRequest(BaseModel):
|
|||||||
|
|
||||||
class CompletionRequest(BaseModel):
|
class CompletionRequest(BaseModel):
|
||||||
model: str
|
model: str
|
||||||
prompt: Union[str, List[str]]
|
# a string, array of strings, array of tokens, or array of token arrays
|
||||||
|
prompt: Union[List[int], List[List[int]], str, List[str]]
|
||||||
suffix: Optional[str] = None
|
suffix: Optional[str] = None
|
||||||
max_tokens: Optional[int] = 16
|
max_tokens: Optional[int] = 16
|
||||||
temperature: Optional[float] = 1.0
|
temperature: Optional[float] = 1.0
|
||||||
|
|||||||
@@ -4,23 +4,6 @@ import torch.nn as nn
|
|||||||
|
|
||||||
from vllm import activation_ops
|
from vllm import activation_ops
|
||||||
|
|
||||||
_ACTIVATION_REGISTRY = {
|
|
||||||
"gelu": nn.GELU(),
|
|
||||||
# NOTE: The following GELU functions may introduce small rounding errors.
|
|
||||||
"gelu_new": nn.GELU(approximate="tanh"),
|
|
||||||
"gelu_fast": nn.GELU(approximate="tanh"),
|
|
||||||
"gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
|
|
||||||
"relu": nn.ReLU(),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_act_fn(act_fn: str) -> nn.Module:
|
|
||||||
"""Get an activation function by name."""
|
|
||||||
act_fn = act_fn.lower()
|
|
||||||
if act_fn in _ACTIVATION_REGISTRY:
|
|
||||||
return _ACTIVATION_REGISTRY[act_fn]
|
|
||||||
raise ValueError(f"Activation function {act_fn!r} is not supported.")
|
|
||||||
|
|
||||||
|
|
||||||
class SiluAndMul(nn.Module):
|
class SiluAndMul(nn.Module):
|
||||||
"""An activation function for SwiGLU.
|
"""An activation function for SwiGLU.
|
||||||
@@ -38,3 +21,40 @@ class SiluAndMul(nn.Module):
|
|||||||
out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
|
out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
|
||||||
activation_ops.silu_and_mul(out, x)
|
activation_ops.silu_and_mul(out, x)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class NewGELU(nn.Module):
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
|
num_tokens = x.shape[0]
|
||||||
|
d = x.shape[1]
|
||||||
|
out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
|
||||||
|
activation_ops.gelu_new(out, x)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class FastGELU(nn.Module):
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
|
num_tokens = x.shape[0]
|
||||||
|
d = x.shape[1]
|
||||||
|
out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
|
||||||
|
activation_ops.gelu_fast(out, x)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
_ACTIVATION_REGISTRY = {
|
||||||
|
"gelu": nn.GELU(),
|
||||||
|
"gelu_fast": FastGELU(),
|
||||||
|
"gelu_new": NewGELU(),
|
||||||
|
"gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
|
||||||
|
"relu": nn.ReLU(),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_act_fn(act_fn: str) -> nn.Module:
|
||||||
|
"""Get an activation function by name."""
|
||||||
|
act_fn = act_fn.lower()
|
||||||
|
if act_fn in _ACTIVATION_REGISTRY:
|
||||||
|
return _ACTIVATION_REGISTRY[act_fn]
|
||||||
|
raise ValueError(f"Activation function {act_fn!r} is not supported.")
|
||||||
|
|||||||
@@ -357,11 +357,12 @@ class PagedAttentionWithALiBi(PagedAttention):
|
|||||||
# be sliced from a tensor whose length is a multiple of 8.
|
# be sliced from a tensor whose length is a multiple of 8.
|
||||||
padded_len = (prompt_len + 7) // 8 * 8
|
padded_len = (prompt_len + 7) // 8 * 8
|
||||||
bias = torch.empty(
|
bias = torch.empty(
|
||||||
|
1, # batch_size
|
||||||
self.num_heads,
|
self.num_heads,
|
||||||
padded_len,
|
prompt_len,
|
||||||
padded_len,
|
padded_len,
|
||||||
device=self.alibi_slopes.device,
|
device=self.alibi_slopes.device,
|
||||||
)[:, :prompt_len, :prompt_len].copy_(bias)
|
)[:, :, :, :prompt_len].copy_(bias)
|
||||||
bias.mul_(self.alibi_slopes[:, None, None])
|
bias.mul_(self.alibi_slopes[:, None, None])
|
||||||
attn_bias = LowerTriangularMaskWithTensorBias(bias)
|
attn_bias = LowerTriangularMaskWithTensorBias(bias)
|
||||||
input_metadata.attn_bias.append(attn_bias)
|
input_metadata.attn_bias.append(attn_bias)
|
||||||
|
|||||||
@@ -71,20 +71,20 @@ class Sampler(nn.Module):
|
|||||||
# Use in-place division to avoid creating a new tensor.
|
# Use in-place division to avoid creating a new tensor.
|
||||||
logits.div_(t.unsqueeze(dim=1))
|
logits.div_(t.unsqueeze(dim=1))
|
||||||
|
|
||||||
|
# Apply top-p and top-k truncation.
|
||||||
|
top_ps, top_ks = _get_top_p_top_k(input_metadata, self.vocab_size)
|
||||||
|
assert len(top_ps) == len(top_ks) == logits.shape[0]
|
||||||
|
do_top_p = any(p < 1.0 - _SAMPLING_EPS for p in top_ps)
|
||||||
|
do_top_k = any(k != self.vocab_size for k in top_ks)
|
||||||
|
if do_top_p or do_top_k:
|
||||||
|
logits = _apply_top_p_top_k(logits, top_ps, top_ks)
|
||||||
|
|
||||||
# We use float32 for probabilities and log probabilities.
|
# We use float32 for probabilities and log probabilities.
|
||||||
# Compute the probabilities.
|
# Compute the probabilities.
|
||||||
probs = torch.softmax(logits, dim=-1, dtype=torch.float)
|
probs = torch.softmax(logits, dim=-1, dtype=torch.float)
|
||||||
# Compute the log probabilities (before applying top-p and top-k).
|
# Compute the log probabilities (before applying top-p and top-k).
|
||||||
logprobs = torch.log(probs)
|
logprobs = torch.log(probs)
|
||||||
|
|
||||||
# Apply top-p and top-k truncation.
|
|
||||||
top_ps, top_ks = _get_top_p_top_k(input_metadata, self.vocab_size)
|
|
||||||
assert len(top_ps) == len(top_ks) == probs.shape[0]
|
|
||||||
do_top_p = any(p < 1.0 - _SAMPLING_EPS for p in top_ps)
|
|
||||||
do_top_k = any(k != self.vocab_size for k in top_ks)
|
|
||||||
if do_top_p or do_top_k:
|
|
||||||
probs = _apply_top_p_top_k(probs, top_ps, top_ks)
|
|
||||||
|
|
||||||
# Sample the next tokens.
|
# Sample the next tokens.
|
||||||
return _sample(probs, logprobs, input_metadata)
|
return _sample(probs, logprobs, input_metadata)
|
||||||
|
|
||||||
@@ -235,31 +235,32 @@ def _get_top_p_top_k(
|
|||||||
|
|
||||||
|
|
||||||
def _apply_top_p_top_k(
|
def _apply_top_p_top_k(
|
||||||
probs: torch.Tensor,
|
logits: torch.Tensor,
|
||||||
top_ps: List[float],
|
top_ps: List[float],
|
||||||
top_ks: List[int],
|
top_ks: List[int],
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
p = torch.tensor(top_ps, dtype=probs.dtype, device=probs.device)
|
p = torch.tensor(top_ps, dtype=logits.dtype, device=logits.device)
|
||||||
k = torch.tensor(top_ks, dtype=torch.int, device=probs.device)
|
k = torch.tensor(top_ks, dtype=torch.int, device=logits.device)
|
||||||
probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
|
logits_sort, logits_idx = logits.sort(dim=-1, descending=True)
|
||||||
|
|
||||||
# Apply top-p.
|
# Apply top-p.
|
||||||
probs_sum = torch.cumsum(probs_sort, dim=-1)
|
probs_sort = logits_sort.softmax(dim=-1)
|
||||||
|
probs_sum = probs_sort.cumsum(dim=-1)
|
||||||
top_p_mask = (probs_sum - probs_sort) > p.unsqueeze(dim=1)
|
top_p_mask = (probs_sum - probs_sort) > p.unsqueeze(dim=1)
|
||||||
probs_sort[top_p_mask] = 0.0
|
logits_sort[top_p_mask] = -float("inf")
|
||||||
|
|
||||||
# Apply top-k.
|
# Apply top-k.
|
||||||
# Create a mask for the top-k elements.
|
# Create a mask for the top-k elements.
|
||||||
top_k_mask = torch.arange(probs_idx.shape[-1], device=probs_idx.device)
|
top_k_mask = torch.arange(logits_idx.shape[-1], device=logits_idx.device)
|
||||||
top_k_mask = top_k_mask.expand(probs_idx.shape[0], -1)
|
top_k_mask = top_k_mask.expand(logits_idx.shape[0], -1)
|
||||||
top_k_mask = top_k_mask >= k.unsqueeze(dim=1)
|
top_k_mask = top_k_mask >= k.unsqueeze(dim=1)
|
||||||
probs_sort[top_k_mask] = 0.0
|
logits_sort[top_k_mask] = -float("inf")
|
||||||
|
|
||||||
# Re-sort the probabilities.
|
# Re-sort the probabilities.
|
||||||
probs = torch.gather(probs_sort,
|
logits = torch.gather(logits_sort,
|
||||||
dim=-1,
|
dim=-1,
|
||||||
index=torch.argsort(probs_idx, dim=-1))
|
index=torch.argsort(logits_idx, dim=-1))
|
||||||
return probs
|
return logits
|
||||||
|
|
||||||
|
|
||||||
def _get_topk_logprobs(
|
def _get_topk_logprobs(
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from vllm.model_executor.weight_utils import initialize_dummy_weights
|
|||||||
|
|
||||||
# TODO(woosuk): Lazy-load the model classes.
|
# TODO(woosuk): Lazy-load the model classes.
|
||||||
_MODEL_REGISTRY = {
|
_MODEL_REGISTRY = {
|
||||||
|
"AquilaModel": AquilaForCausalLM,
|
||||||
"BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b
|
"BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b
|
||||||
"BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b
|
"BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b
|
||||||
"BloomForCausalLM": BloomForCausalLM,
|
"BloomForCausalLM": BloomForCausalLM,
|
||||||
@@ -19,10 +20,12 @@ _MODEL_REGISTRY = {
|
|||||||
"GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
|
"GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
|
||||||
"GPTJForCausalLM": GPTJForCausalLM,
|
"GPTJForCausalLM": GPTJForCausalLM,
|
||||||
"GPTNeoXForCausalLM": GPTNeoXForCausalLM,
|
"GPTNeoXForCausalLM": GPTNeoXForCausalLM,
|
||||||
|
"InternLMForCausalLM": InternLMForCausalLM,
|
||||||
"LlamaForCausalLM": LlamaForCausalLM,
|
"LlamaForCausalLM": LlamaForCausalLM,
|
||||||
"LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-*
|
"LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-*
|
||||||
"MPTForCausalLM": MPTForCausalLM,
|
"MPTForCausalLM": MPTForCausalLM,
|
||||||
"OPTForCausalLM": OPTForCausalLM,
|
"OPTForCausalLM": OPTForCausalLM,
|
||||||
|
"QWenLMHeadModel": QWenLMHeadModel,
|
||||||
"RWForCausalLM": FalconForCausalLM,
|
"RWForCausalLM": FalconForCausalLM,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from vllm.model_executor.models.aquila import AquilaForCausalLM
|
||||||
from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
|
from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
|
||||||
BaichuanForCausalLM)
|
BaichuanForCausalLM)
|
||||||
from vllm.model_executor.models.bloom import BloomForCausalLM
|
from vllm.model_executor.models.bloom import BloomForCausalLM
|
||||||
@@ -6,11 +7,14 @@ from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
|
|||||||
from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
|
from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
|
||||||
from vllm.model_executor.models.gpt_j import GPTJForCausalLM
|
from vllm.model_executor.models.gpt_j import GPTJForCausalLM
|
||||||
from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
|
from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
|
||||||
|
from vllm.model_executor.models.internlm import InternLMForCausalLM
|
||||||
from vllm.model_executor.models.llama import LlamaForCausalLM
|
from vllm.model_executor.models.llama import LlamaForCausalLM
|
||||||
from vllm.model_executor.models.mpt import MPTForCausalLM
|
from vllm.model_executor.models.mpt import MPTForCausalLM
|
||||||
from vllm.model_executor.models.opt import OPTForCausalLM
|
from vllm.model_executor.models.opt import OPTForCausalLM
|
||||||
|
from vllm.model_executor.models.qwen import QWenLMHeadModel
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"AquilaForCausalLM",
|
||||||
"BaiChuanForCausalLM",
|
"BaiChuanForCausalLM",
|
||||||
"BaichuanForCausalLM",
|
"BaichuanForCausalLM",
|
||||||
"BloomForCausalLM",
|
"BloomForCausalLM",
|
||||||
@@ -19,7 +23,9 @@ __all__ = [
|
|||||||
"GPTBigCodeForCausalLM",
|
"GPTBigCodeForCausalLM",
|
||||||
"GPTJForCausalLM",
|
"GPTJForCausalLM",
|
||||||
"GPTNeoXForCausalLM",
|
"GPTNeoXForCausalLM",
|
||||||
|
"InternLMForCausalLM",
|
||||||
"LlamaForCausalLM",
|
"LlamaForCausalLM",
|
||||||
"MPTForCausalLM",
|
"MPTForCausalLM",
|
||||||
"OPTForCausalLM",
|
"OPTForCausalLM",
|
||||||
|
"QWenLMHeadModel",
|
||||||
]
|
]
|
||||||
|
|||||||
362
vllm/model_executor/models/aquila.py
Normal file
362
vllm/model_executor/models/aquila.py
Normal file
@@ -0,0 +1,362 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Adapted from
|
||||||
|
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
|
||||||
|
# Copyright 2023 The vLLM team.
|
||||||
|
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||||
|
# and OPT implementations in this library. It has been modified from its
|
||||||
|
# original forms to accommodate minor architectural differences compared
|
||||||
|
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Inference-only LLaMA model compatible with HuggingFace weights.
|
||||||
|
|
||||||
|
The input of the model is flattened to a 1D tensor of tokens. The model uses
|
||||||
|
InputMetadata to extract the original 2D shape of the input.
|
||||||
|
"""
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
from vllm.model_executor.input_metadata import InputMetadata
|
||||||
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
|
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
|
||||||
|
from vllm.model_executor.layers.sampler import Sampler
|
||||||
|
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
|
||||||
|
load_tensor_parallel_weights)
|
||||||
|
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||||
|
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
|
||||||
|
from vllm.model_executor.parallel_utils.tensor_parallel import (
|
||||||
|
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
|
||||||
|
from vllm.sequence import SequenceOutputs
|
||||||
|
from vllm.transformers_utils.configs.aquila import AquilaConfig
|
||||||
|
|
||||||
|
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||||
|
|
||||||
|
|
||||||
|
class AquilaMLP(nn.Module):
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size: int,
|
||||||
|
intermediate_size: int,
|
||||||
|
hidden_act: str,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.gate_up_proj = ColumnParallelLinear(hidden_size,
|
||||||
|
2 * intermediate_size,
|
||||||
|
bias=False,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False)
|
||||||
|
self.down_proj = RowParallelLinear(intermediate_size,
|
||||||
|
hidden_size,
|
||||||
|
bias=False,
|
||||||
|
input_is_parallel=True,
|
||||||
|
perform_initialization=False)
|
||||||
|
if hidden_act != "silu":
|
||||||
|
raise ValueError(f"Unsupported activation: {hidden_act}. "
|
||||||
|
"Only silu is supported for now.")
|
||||||
|
self.act_fn = SiluAndMul()
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
gate_up, _ = self.gate_up_proj(x)
|
||||||
|
x = self.act_fn(gate_up)
|
||||||
|
x, _ = self.down_proj(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class AquilaRMSNorm(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, hidden_size, eps=1e-6):
|
||||||
|
"""
|
||||||
|
AquilaRMSNorm is equivalent to T5LayerNorm
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
self.weight = nn.Parameter(torch.ones(hidden_size))
|
||||||
|
self.variance_epsilon = eps
|
||||||
|
|
||||||
|
def forward(self, hidden_states):
|
||||||
|
input_dtype = hidden_states.dtype
|
||||||
|
variance = hidden_states.to(torch.float32).pow(2).mean(-1,
|
||||||
|
keepdim=True)
|
||||||
|
hidden_states = hidden_states * torch.rsqrt(variance +
|
||||||
|
self.variance_epsilon)
|
||||||
|
|
||||||
|
return (self.weight * hidden_states).to(input_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
class AquilaAttention(nn.Module):
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size: int,
|
||||||
|
num_heads: int,
|
||||||
|
num_kv_heads: int,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
tp_size = get_tensor_model_parallel_world_size()
|
||||||
|
self.total_num_heads = num_heads
|
||||||
|
assert self.total_num_heads % tp_size == 0
|
||||||
|
self.num_heads = self.total_num_heads // tp_size
|
||||||
|
self.total_num_kv_heads = num_kv_heads
|
||||||
|
assert self.total_num_kv_heads % tp_size == 0
|
||||||
|
self.num_kv_heads = self.total_num_kv_heads // tp_size
|
||||||
|
self.head_dim = hidden_size // self.total_num_heads
|
||||||
|
self.q_size = self.num_heads * self.head_dim
|
||||||
|
self.kv_size = self.num_kv_heads * self.head_dim
|
||||||
|
self.scaling = self.head_dim**-0.5
|
||||||
|
|
||||||
|
self.qkv_proj = ColumnParallelLinear(
|
||||||
|
hidden_size,
|
||||||
|
(self.total_num_heads + 2 * self.total_num_kv_heads) *
|
||||||
|
self.head_dim,
|
||||||
|
bias=False,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
self.o_proj = RowParallelLinear(
|
||||||
|
self.total_num_heads * self.head_dim,
|
||||||
|
hidden_size,
|
||||||
|
bias=False,
|
||||||
|
input_is_parallel=True,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
self.attn = PagedAttentionWithRoPE(
|
||||||
|
self.num_heads,
|
||||||
|
self.head_dim,
|
||||||
|
self.scaling,
|
||||||
|
rotary_dim=self.head_dim,
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
kv_cache: KVCache,
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_event: Optional[torch.cuda.Event],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
qkv, _ = self.qkv_proj(hidden_states)
|
||||||
|
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
|
||||||
|
k_cache, v_cache = kv_cache
|
||||||
|
attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
|
||||||
|
input_metadata, cache_event)
|
||||||
|
output, _ = self.o_proj(attn_output)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class AquilaDecoderLayer(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config: AquilaConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.hidden_size = config.hidden_size
|
||||||
|
self.self_attn = AquilaAttention(
|
||||||
|
hidden_size=self.hidden_size,
|
||||||
|
num_heads=config.num_attention_heads,
|
||||||
|
num_kv_heads=config.num_attention_heads,
|
||||||
|
)
|
||||||
|
self.mlp = AquilaMLP(
|
||||||
|
hidden_size=self.hidden_size,
|
||||||
|
intermediate_size=config.intermediate_size,
|
||||||
|
hidden_act=config.hidden_act,
|
||||||
|
)
|
||||||
|
self.input_layernorm = AquilaRMSNorm(config.hidden_size,
|
||||||
|
eps=config.rms_norm_eps)
|
||||||
|
self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size,
|
||||||
|
eps=config.rms_norm_eps)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
kv_cache: KVCache,
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_event: Optional[torch.cuda.Event],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
# Self Attention
|
||||||
|
residual = hidden_states
|
||||||
|
hidden_states = self.input_layernorm(hidden_states)
|
||||||
|
hidden_states = self.self_attn(
|
||||||
|
positions=positions,
|
||||||
|
hidden_states=hidden_states,
|
||||||
|
kv_cache=kv_cache,
|
||||||
|
input_metadata=input_metadata,
|
||||||
|
cache_event=cache_event,
|
||||||
|
)
|
||||||
|
hidden_states = residual + hidden_states
|
||||||
|
|
||||||
|
# Fully Connected
|
||||||
|
residual = hidden_states
|
||||||
|
hidden_states = self.post_attention_layernorm(hidden_states)
|
||||||
|
hidden_states = self.mlp(hidden_states)
|
||||||
|
hidden_states = residual + hidden_states
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class AquilaModel(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config: AquilaConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.padding_idx = config.pad_token_id
|
||||||
|
self.vocab_size = config.vocab_size
|
||||||
|
|
||||||
|
#vocab_size = ((config.vocab_size + 63) // 64) * 64
|
||||||
|
self.embed_tokens = VocabParallelEmbedding(
|
||||||
|
config.vocab_size,
|
||||||
|
config.hidden_size,
|
||||||
|
perform_initialization=False)
|
||||||
|
self.layers = nn.ModuleList([
|
||||||
|
AquilaDecoderLayer(config) for _ in range(config.num_hidden_layers)
|
||||||
|
])
|
||||||
|
self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
kv_caches: List[KVCache],
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_events: Optional[List[torch.cuda.Event]],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
|
for i in range(len(self.layers)):
|
||||||
|
if cache_events is None:
|
||||||
|
cache_event = None
|
||||||
|
else:
|
||||||
|
cache_event = cache_events[i]
|
||||||
|
layer = self.layers[i]
|
||||||
|
hidden_states = layer(
|
||||||
|
positions,
|
||||||
|
hidden_states,
|
||||||
|
kv_caches[i],
|
||||||
|
input_metadata,
|
||||||
|
cache_event,
|
||||||
|
)
|
||||||
|
hidden_states = self.norm(hidden_states)
|
||||||
|
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class AquilaForCausalLM(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.model = AquilaModel(config)
|
||||||
|
vocab_size = ((config.vocab_size + 63) // 64) * 64
|
||||||
|
self.lm_head = ColumnParallelLinear(config.hidden_size,
|
||||||
|
vocab_size,
|
||||||
|
bias=False,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False)
|
||||||
|
self.sampler = Sampler(config.vocab_size)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
kv_caches: List[KVCache],
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_events: Optional[List[torch.cuda.Event]],
|
||||||
|
) -> Dict[int, SequenceOutputs]:
|
||||||
|
hidden_states = self.model(input_ids, positions, kv_caches,
|
||||||
|
input_metadata, cache_events)
|
||||||
|
next_tokens = self.sampler(self.lm_head.weight, hidden_states,
|
||||||
|
input_metadata)
|
||||||
|
return next_tokens
|
||||||
|
|
||||||
|
_column_parallel_weights = [
|
||||||
|
"embed_tokens.weight", "lm_head.weight", "qkv_proj.weight",
|
||||||
|
"gate_proj.weight", "up_proj.weight"
|
||||||
|
]
|
||||||
|
_row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
|
||||||
|
|
||||||
|
def load_weights(self,
|
||||||
|
model_name_or_path: str,
|
||||||
|
cache_dir: Optional[str] = None,
|
||||||
|
use_np_cache: bool = False):
|
||||||
|
tp_size = get_tensor_model_parallel_world_size()
|
||||||
|
tensor_model_parallel_rank = get_tensor_model_parallel_rank()
|
||||||
|
q_proj_shard_size = (self.config.hidden_size // tp_size)
|
||||||
|
kv_proj_shard_size = (self.config.hidden_size //
|
||||||
|
self.config.num_attention_heads *
|
||||||
|
self.config.num_attention_heads // tp_size)
|
||||||
|
attention_weight_specs = [
|
||||||
|
# (weight_name, shard_size, offset)
|
||||||
|
("q_proj", q_proj_shard_size, 0),
|
||||||
|
("k_proj", kv_proj_shard_size, q_proj_shard_size),
|
||||||
|
("v_proj", kv_proj_shard_size,
|
||||||
|
q_proj_shard_size + kv_proj_shard_size),
|
||||||
|
]
|
||||||
|
state_dict = self.state_dict()
|
||||||
|
|
||||||
|
for name, loaded_weight in hf_model_weights_iterator(
|
||||||
|
model_name_or_path, cache_dir, use_np_cache):
|
||||||
|
if "rotary_emb.inv_freq" in name:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if "embed_tokens" in name or "lm_head" in name:
|
||||||
|
param = state_dict[name]
|
||||||
|
# Consider padding in the vocab size.
|
||||||
|
padded_vocab_size = (param.shape[0] * tp_size)
|
||||||
|
num_extra_rows = padded_vocab_size - self.config.vocab_size
|
||||||
|
extra_rows = torch.empty(num_extra_rows,
|
||||||
|
loaded_weight.shape[1])
|
||||||
|
extra_rows = extra_rows.to(loaded_weight)
|
||||||
|
loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
|
||||||
|
|
||||||
|
is_attention_weight = False
|
||||||
|
for weight_name, shard_size, offset in attention_weight_specs:
|
||||||
|
if weight_name not in name:
|
||||||
|
continue
|
||||||
|
param = state_dict[name.replace(weight_name, "qkv_proj")]
|
||||||
|
|
||||||
|
loaded_weight = loaded_weight[
|
||||||
|
shard_size * tensor_model_parallel_rank:shard_size *
|
||||||
|
(tensor_model_parallel_rank + 1)]
|
||||||
|
param_slice = param.data[offset:offset + shard_size]
|
||||||
|
assert param_slice.shape == loaded_weight.shape
|
||||||
|
|
||||||
|
param_slice.copy_(loaded_weight)
|
||||||
|
is_attention_weight = True
|
||||||
|
break
|
||||||
|
if is_attention_weight:
|
||||||
|
continue
|
||||||
|
|
||||||
|
is_gate_up_weight = False
|
||||||
|
for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
|
||||||
|
if weight_name not in name:
|
||||||
|
continue
|
||||||
|
param = state_dict[name.replace(weight_name, "gate_up_proj")]
|
||||||
|
shard_size = param.shape[0] // 2
|
||||||
|
loaded_weight = loaded_weight[
|
||||||
|
shard_size * tensor_model_parallel_rank:shard_size *
|
||||||
|
(tensor_model_parallel_rank + 1)]
|
||||||
|
param_slice = param.data[shard_size * stride_id:shard_size *
|
||||||
|
(stride_id + 1)]
|
||||||
|
assert param_slice.shape == loaded_weight.shape
|
||||||
|
param_slice.copy_(loaded_weight)
|
||||||
|
is_gate_up_weight = True
|
||||||
|
break
|
||||||
|
if is_gate_up_weight:
|
||||||
|
continue
|
||||||
|
|
||||||
|
param = state_dict[name]
|
||||||
|
load_tensor_parallel_weights(param, loaded_weight, name,
|
||||||
|
self._column_parallel_weights,
|
||||||
|
self._row_parallel_weights,
|
||||||
|
tensor_model_parallel_rank)
|
||||||
@@ -49,10 +49,11 @@ class GPTBigCodeAttention(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.hidden_size = config.hidden_size
|
self.hidden_size = config.hidden_size
|
||||||
total_num_heads = config.num_attention_heads
|
total_num_heads = config.num_attention_heads
|
||||||
tensor_model_parallel_world_size = (
|
self.tensor_model_parallel_world_size = (
|
||||||
get_tensor_model_parallel_world_size())
|
get_tensor_model_parallel_world_size())
|
||||||
assert total_num_heads % tensor_model_parallel_world_size == 0
|
assert total_num_heads % self.tensor_model_parallel_world_size == 0
|
||||||
self.num_heads = total_num_heads // tensor_model_parallel_world_size
|
self.num_heads = (total_num_heads //
|
||||||
|
self.tensor_model_parallel_world_size)
|
||||||
self.head_dim = self.hidden_size // total_num_heads
|
self.head_dim = self.hidden_size // total_num_heads
|
||||||
self.scale = self.head_dim**-0.5
|
self.scale = self.head_dim**-0.5
|
||||||
|
|
||||||
@@ -101,7 +102,10 @@ class GPTBigCodeAttention(nn.Module):
|
|||||||
k, v = kv.split([self.kv_dim, self.kv_dim], dim=-1)
|
k, v = kv.split([self.kv_dim, self.kv_dim], dim=-1)
|
||||||
else:
|
else:
|
||||||
qkv, _ = self.c_attn(hidden_states)
|
qkv, _ = self.c_attn(hidden_states)
|
||||||
q, k, v = qkv.split([self.hidden_size, self.kv_dim, self.kv_dim],
|
q, k, v = qkv.split([
|
||||||
|
self.hidden_size // self.tensor_model_parallel_world_size,
|
||||||
|
self.kv_dim, self.kv_dim
|
||||||
|
],
|
||||||
dim=-1)
|
dim=-1)
|
||||||
key_cache, value_cache = kv_cache
|
key_cache, value_cache = kv_cache
|
||||||
attn_output = self.attn(q, k, v, key_cache, value_cache,
|
attn_output = self.attn(q, k, v, key_cache, value_cache,
|
||||||
|
|||||||
299
vllm/model_executor/models/internlm.py
Normal file
299
vllm/model_executor/models/internlm.py
Normal file
@@ -0,0 +1,299 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
from transformers import LlamaConfig
|
||||||
|
|
||||||
|
from vllm.model_executor.input_metadata import InputMetadata
|
||||||
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
|
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
|
||||||
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
|
from vllm.model_executor.layers.sampler import Sampler
|
||||||
|
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||||
|
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
|
||||||
|
from vllm.model_executor.parallel_utils.tensor_parallel import (
|
||||||
|
ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding)
|
||||||
|
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
|
||||||
|
load_tensor_parallel_weights)
|
||||||
|
from vllm.sequence import SequenceOutputs
|
||||||
|
|
||||||
|
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||||
|
|
||||||
|
|
||||||
|
class InternLMMLP(nn.Module):
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size: int,
|
||||||
|
intermediate_size: int,
|
||||||
|
hidden_act: str,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.gate_up_proj = ColumnParallelLinear(hidden_size,
|
||||||
|
2 * intermediate_size,
|
||||||
|
bias=False,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False)
|
||||||
|
self.down_proj = RowParallelLinear(intermediate_size,
|
||||||
|
hidden_size,
|
||||||
|
bias=False,
|
||||||
|
input_is_parallel=True,
|
||||||
|
perform_initialization=False)
|
||||||
|
if hidden_act != "silu":
|
||||||
|
raise ValueError(f"Unsupported activation: {hidden_act}. "
|
||||||
|
"Only silu is supported for now.")
|
||||||
|
self.act_fn = SiluAndMul()
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
gate_up, _ = self.gate_up_proj(x)
|
||||||
|
x = self.act_fn(gate_up)
|
||||||
|
x, _ = self.down_proj(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class InternLMAttention(nn.Module):
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size: int,
|
||||||
|
num_heads: int,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
tensor_model_parallel_world_size = (
|
||||||
|
get_tensor_model_parallel_world_size())
|
||||||
|
self.total_num_heads = num_heads
|
||||||
|
assert self.total_num_heads % tensor_model_parallel_world_size == 0
|
||||||
|
self.num_heads = (self.total_num_heads //
|
||||||
|
tensor_model_parallel_world_size)
|
||||||
|
self.head_dim = hidden_size // self.total_num_heads
|
||||||
|
self.scaling = self.head_dim**-0.5
|
||||||
|
|
||||||
|
self.qkv_proj = ColumnParallelLinear(
|
||||||
|
hidden_size,
|
||||||
|
3 * self.total_num_heads * self.head_dim,
|
||||||
|
bias=True,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
self.o_proj = RowParallelLinear(
|
||||||
|
self.total_num_heads * self.head_dim,
|
||||||
|
hidden_size,
|
||||||
|
bias=True,
|
||||||
|
input_is_parallel=True,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
self.attn = PagedAttentionWithRoPE(self.num_heads,
|
||||||
|
self.head_dim,
|
||||||
|
self.scaling,
|
||||||
|
rotary_dim=self.head_dim)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
kv_cache: KVCache,
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_event: Optional[torch.cuda.Event],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
qkv, _ = self.qkv_proj(hidden_states)
|
||||||
|
q, k, v = qkv.chunk(chunks=3, dim=-1)
|
||||||
|
k_cache, v_cache = kv_cache
|
||||||
|
attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
|
||||||
|
input_metadata, cache_event)
|
||||||
|
output, _ = self.o_proj(attn_output)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class InternLMDecoderLayer(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config: LlamaConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.hidden_size = config.hidden_size
|
||||||
|
self.self_attn = InternLMAttention(
|
||||||
|
hidden_size=self.hidden_size,
|
||||||
|
num_heads=config.num_attention_heads,
|
||||||
|
)
|
||||||
|
self.mlp = InternLMMLP(
|
||||||
|
hidden_size=self.hidden_size,
|
||||||
|
intermediate_size=config.intermediate_size,
|
||||||
|
hidden_act=config.hidden_act,
|
||||||
|
)
|
||||||
|
self.input_layernorm = RMSNorm(config.hidden_size,
|
||||||
|
eps=config.rms_norm_eps)
|
||||||
|
self.post_attention_layernorm = RMSNorm(config.hidden_size,
|
||||||
|
eps=config.rms_norm_eps)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
kv_cache: KVCache,
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_event: Optional[torch.cuda.Event],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
# Self Attention
|
||||||
|
residual = hidden_states
|
||||||
|
hidden_states = self.input_layernorm(hidden_states)
|
||||||
|
hidden_states = self.self_attn(
|
||||||
|
positions=positions,
|
||||||
|
hidden_states=hidden_states,
|
||||||
|
kv_cache=kv_cache,
|
||||||
|
input_metadata=input_metadata,
|
||||||
|
cache_event=cache_event,
|
||||||
|
)
|
||||||
|
hidden_states = residual + hidden_states
|
||||||
|
|
||||||
|
# Fully Connected
|
||||||
|
residual = hidden_states
|
||||||
|
hidden_states = self.post_attention_layernorm(hidden_states)
|
||||||
|
hidden_states = self.mlp(hidden_states)
|
||||||
|
hidden_states = residual + hidden_states
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class InternLMModel(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config: LlamaConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.padding_idx = config.pad_token_id
|
||||||
|
self.vocab_size = config.vocab_size
|
||||||
|
|
||||||
|
vocab_size = ((config.vocab_size + 63) // 64) * 64
|
||||||
|
self.embed_tokens = VocabParallelEmbedding(
|
||||||
|
vocab_size, config.hidden_size, perform_initialization=False)
|
||||||
|
self.layers = nn.ModuleList([
|
||||||
|
InternLMDecoderLayer(config)
|
||||||
|
for _ in range(config.num_hidden_layers)
|
||||||
|
])
|
||||||
|
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
kv_caches: List[KVCache],
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_events: Optional[List[torch.cuda.Event]],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
|
for i in range(len(self.layers)):
|
||||||
|
if cache_events is None:
|
||||||
|
cache_event = None
|
||||||
|
else:
|
||||||
|
cache_event = cache_events[i]
|
||||||
|
layer = self.layers[i]
|
||||||
|
hidden_states = layer(
|
||||||
|
positions,
|
||||||
|
hidden_states,
|
||||||
|
kv_caches[i],
|
||||||
|
input_metadata,
|
||||||
|
cache_event,
|
||||||
|
)
|
||||||
|
hidden_states = self.norm(hidden_states)
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class InternLMForCausalLM(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.model = InternLMModel(config)
|
||||||
|
vocab_size = ((config.vocab_size + 63) // 64) * 64
|
||||||
|
self.lm_head = ColumnParallelLinear(config.hidden_size,
|
||||||
|
vocab_size,
|
||||||
|
bias=False,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False)
|
||||||
|
self.sampler = Sampler(config.vocab_size)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
kv_caches: List[KVCache],
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_events: Optional[List[torch.cuda.Event]],
|
||||||
|
) -> Dict[int, SequenceOutputs]:
|
||||||
|
hidden_states = self.model(input_ids, positions, kv_caches,
|
||||||
|
input_metadata, cache_events)
|
||||||
|
next_tokens = self.sampler(self.lm_head.weight, hidden_states,
|
||||||
|
input_metadata)
|
||||||
|
return next_tokens
|
||||||
|
|
||||||
|
_column_parallel_weights = [
|
||||||
|
"embed_tokens.weight", "lm_head.weight", "qkv_proj.weight",
|
||||||
|
"gate_proj.weight", "up_proj.weight"
|
||||||
|
]
|
||||||
|
_row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
|
||||||
|
|
||||||
|
def load_weights(self,
|
||||||
|
model_name_or_path: str,
|
||||||
|
cache_dir: Optional[str] = None,
|
||||||
|
use_np_cache: bool = False):
|
||||||
|
tensor_model_parallel_world_size = (
|
||||||
|
get_tensor_model_parallel_world_size())
|
||||||
|
tensor_model_parallel_rank = get_tensor_model_parallel_rank()
|
||||||
|
state_dict = self.state_dict()
|
||||||
|
|
||||||
|
for name, loaded_weight in hf_model_weights_iterator(
|
||||||
|
model_name_or_path, cache_dir, use_np_cache):
|
||||||
|
if "rotary_emb.inv_freq" in name:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if "embed_tokens" in name or "lm_head" in name:
|
||||||
|
param = state_dict[name]
|
||||||
|
# Consider padding in the vocab size.
|
||||||
|
padded_vocab_size = (param.shape[0] *
|
||||||
|
tensor_model_parallel_world_size)
|
||||||
|
num_extra_rows = padded_vocab_size - self.config.vocab_size
|
||||||
|
extra_rows = torch.empty(num_extra_rows,
|
||||||
|
loaded_weight.shape[1])
|
||||||
|
extra_rows = extra_rows.to(loaded_weight)
|
||||||
|
loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
|
||||||
|
|
||||||
|
is_attention_weight = False
|
||||||
|
for stride_id, att_weight_name in enumerate(
|
||||||
|
["q_proj", "k_proj", "v_proj"]):
|
||||||
|
if att_weight_name not in name:
|
||||||
|
continue
|
||||||
|
param = state_dict[name.replace(att_weight_name, "qkv_proj")]
|
||||||
|
shard_size = param.shape[0] // 3
|
||||||
|
loaded_weight = loaded_weight[
|
||||||
|
shard_size * tensor_model_parallel_rank:shard_size *
|
||||||
|
(tensor_model_parallel_rank + 1)]
|
||||||
|
param_slice = param.data[shard_size * stride_id:shard_size *
|
||||||
|
(stride_id + 1)]
|
||||||
|
assert param_slice.shape == loaded_weight.shape
|
||||||
|
param_slice.copy_(loaded_weight)
|
||||||
|
is_attention_weight = True
|
||||||
|
break
|
||||||
|
if is_attention_weight:
|
||||||
|
continue
|
||||||
|
|
||||||
|
is_gate_up_weight = False
|
||||||
|
for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
|
||||||
|
if weight_name not in name:
|
||||||
|
continue
|
||||||
|
param = state_dict[name.replace(weight_name, "gate_up_proj")]
|
||||||
|
shard_size = param.shape[0] // 2
|
||||||
|
loaded_weight = loaded_weight[
|
||||||
|
shard_size * tensor_model_parallel_rank:shard_size *
|
||||||
|
(tensor_model_parallel_rank + 1)]
|
||||||
|
param_slice = param.data[shard_size * stride_id:shard_size *
|
||||||
|
(stride_id + 1)]
|
||||||
|
assert param_slice.shape == loaded_weight.shape
|
||||||
|
param_slice.copy_(loaded_weight)
|
||||||
|
is_gate_up_weight = True
|
||||||
|
break
|
||||||
|
if is_gate_up_weight:
|
||||||
|
continue
|
||||||
|
|
||||||
|
param = state_dict[name]
|
||||||
|
load_tensor_parallel_weights(param, loaded_weight, name,
|
||||||
|
self._column_parallel_weights,
|
||||||
|
self._row_parallel_weights,
|
||||||
|
tensor_model_parallel_rank)
|
||||||
316
vllm/model_executor/models/qwen.py
Normal file
316
vllm/model_executor/models/qwen.py
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Adapted from
|
||||||
|
# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
|
||||||
|
# Copyright (c) Alibaba Cloud.
|
||||||
|
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
|
||||||
|
"""Inference-only QWen model compatible with HuggingFace weights.
|
||||||
|
|
||||||
|
The input of the model is flattened to a 1D tensor of tokens. The model uses
|
||||||
|
InputMetadata to extract the original 2D shape of the input.
|
||||||
|
"""
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
from vllm.model_executor.input_metadata import InputMetadata
|
||||||
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
|
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
|
||||||
|
from vllm.model_executor.layers.sampler import Sampler
|
||||||
|
from vllm.model_executor.weight_utils import (
|
||||||
|
hf_model_weights_iterator,
|
||||||
|
load_tensor_parallel_weights,
|
||||||
|
)
|
||||||
|
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||||
|
get_tensor_model_parallel_rank,
|
||||||
|
get_tensor_model_parallel_world_size,
|
||||||
|
)
|
||||||
|
from vllm.model_executor.parallel_utils.tensor_parallel import (
|
||||||
|
VocabParallelEmbedding,
|
||||||
|
ColumnParallelLinear,
|
||||||
|
RowParallelLinear,
|
||||||
|
)
|
||||||
|
from vllm.sequence import SequenceOutputs
|
||||||
|
from vllm.transformers_utils.configs.qwen import QWenConfig
|
||||||
|
|
||||||
|
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||||
|
|
||||||
|
|
||||||
|
class QWenMLP(nn.Module):
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size: int,
|
||||||
|
intermediate_size: int,
|
||||||
|
hidden_act: str = "silu",
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.gate_up_proj = ColumnParallelLinear(
|
||||||
|
hidden_size,
|
||||||
|
2 * intermediate_size,
|
||||||
|
bias=False,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
self.c_proj = RowParallelLinear(
|
||||||
|
intermediate_size,
|
||||||
|
hidden_size,
|
||||||
|
bias=False,
|
||||||
|
input_is_parallel=True,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
if hidden_act != "silu":
|
||||||
|
raise ValueError(f"Unsupported activation: {hidden_act}. "
|
||||||
|
"Only silu is supported for now.")
|
||||||
|
self.act_fn = SiluAndMul()
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
gate_up, _ = self.gate_up_proj(x)
|
||||||
|
x = self.act_fn(gate_up)
|
||||||
|
x, _ = self.c_proj(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class QWenAttention(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, hidden_size: int, num_heads: int,
|
||||||
|
max_position_embeddings: int):
|
||||||
|
super().__init__()
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
|
||||||
|
)
|
||||||
|
self.total_num_heads = num_heads
|
||||||
|
assert self.total_num_heads % tensor_model_parallel_world_size == 0
|
||||||
|
self.num_heads = (self.total_num_heads //
|
||||||
|
tensor_model_parallel_world_size)
|
||||||
|
self.head_dim = hidden_size // self.total_num_heads
|
||||||
|
|
||||||
|
# pylint: disable=invalid-name
|
||||||
|
self.c_attn = ColumnParallelLinear(
|
||||||
|
hidden_size,
|
||||||
|
3 * hidden_size,
|
||||||
|
bias=True,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
self.c_proj = RowParallelLinear(
|
||||||
|
self.total_num_heads * self.head_dim,
|
||||||
|
hidden_size,
|
||||||
|
bias=False,
|
||||||
|
input_is_parallel=True,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
self.scaling = self.head_dim**-0.5
|
||||||
|
self.attn = PagedAttentionWithRoPE(
|
||||||
|
self.num_heads,
|
||||||
|
self.head_dim,
|
||||||
|
self.scaling,
|
||||||
|
rotary_dim=self.head_dim,
|
||||||
|
max_position=max_position_embeddings,
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
kv_cache: KVCache,
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_event: Optional[torch.cuda.Event],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
qkv, _ = self.c_attn(hidden_states)
|
||||||
|
q, k, v = qkv.chunk(chunks=3, dim=-1)
|
||||||
|
|
||||||
|
k_cache, v_cache = kv_cache
|
||||||
|
attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
|
||||||
|
input_metadata, cache_event)
|
||||||
|
|
||||||
|
output, _ = self.c_proj(attn_output)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class QWenBlock(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config: QWenConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.ln_1 = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||||
|
|
||||||
|
self.attn = QWenAttention(config.n_embd, config.num_attention_heads,
|
||||||
|
config.max_position_embeddings)
|
||||||
|
|
||||||
|
self.ln_2 = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||||
|
|
||||||
|
self.mlp = QWenMLP(config.n_embd, config.ffn_hidden_size // 2)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
kv_cache: KVCache,
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_event: Optional[torch.cuda.Event],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
# Self Attention
|
||||||
|
residual = hidden_states
|
||||||
|
hidden_states = self.ln_1(hidden_states)
|
||||||
|
hidden_states = self.attn(
|
||||||
|
positions=positions,
|
||||||
|
hidden_states=hidden_states,
|
||||||
|
kv_cache=kv_cache,
|
||||||
|
input_metadata=input_metadata,
|
||||||
|
cache_event=cache_event,
|
||||||
|
)
|
||||||
|
hidden_states = residual + hidden_states
|
||||||
|
|
||||||
|
# Fully Connected
|
||||||
|
residual = hidden_states
|
||||||
|
hidden_states = self.ln_2(hidden_states)
|
||||||
|
hidden_states = self.mlp(hidden_states)
|
||||||
|
hidden_states = residual + hidden_states
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class QWenModel(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config: QWenConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.vocab_size = config.vocab_size
|
||||||
|
|
||||||
|
vocab_size = ((config.vocab_size + 63) // 64) * 64
|
||||||
|
self.wte = VocabParallelEmbedding(vocab_size,
|
||||||
|
config.n_embd,
|
||||||
|
perform_initialization=False)
|
||||||
|
self.h = nn.ModuleList(
|
||||||
|
[QWenBlock(config) for _ in range(config.num_hidden_layers)])
|
||||||
|
self.ln_f = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
kv_caches: List[KVCache],
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_events: Optional[List[torch.cuda.Event]],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
hidden_states = self.wte(input_ids)
|
||||||
|
for i in range(len(self.h)):
|
||||||
|
if cache_events is None:
|
||||||
|
cache_event = None
|
||||||
|
else:
|
||||||
|
cache_event = cache_events[i]
|
||||||
|
layer = self.h[i]
|
||||||
|
hidden_states = layer(
|
||||||
|
positions,
|
||||||
|
hidden_states,
|
||||||
|
kv_caches[i],
|
||||||
|
input_metadata,
|
||||||
|
cache_event,
|
||||||
|
)
|
||||||
|
hidden_states = self.ln_f(hidden_states)
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class QWenLMHeadModel(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, config: QWenConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.transformer = QWenModel(config)
|
||||||
|
vocab_size = ((config.vocab_size + 63) // 64) * 64
|
||||||
|
self.lm_head = ColumnParallelLinear(
|
||||||
|
config.n_embd,
|
||||||
|
vocab_size,
|
||||||
|
bias=False,
|
||||||
|
gather_output=False,
|
||||||
|
perform_initialization=False,
|
||||||
|
)
|
||||||
|
self.sampler = Sampler(config.vocab_size)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
kv_caches: List[KVCache],
|
||||||
|
input_metadata: InputMetadata,
|
||||||
|
cache_events: Optional[List[torch.cuda.Event]],
|
||||||
|
) -> Dict[int, SequenceOutputs]:
|
||||||
|
hidden_states = self.transformer(input_ids, positions, kv_caches,
|
||||||
|
input_metadata, cache_events)
|
||||||
|
next_tokens = self.sampler(self.lm_head.weight, hidden_states,
|
||||||
|
input_metadata)
|
||||||
|
return next_tokens
|
||||||
|
|
||||||
|
_column_parallel_weights = ["wte.weight", "lm_head.weight"]
|
||||||
|
_row_parallel_weights = ["c_proj.weight"]
|
||||||
|
|
||||||
|
def load_weights(
|
||||||
|
self,
|
||||||
|
model_name_or_path: str,
|
||||||
|
cache_dir: Optional[str] = None,
|
||||||
|
use_np_cache: bool = False,
|
||||||
|
):
|
||||||
|
tp_world_size = get_tensor_model_parallel_world_size()
|
||||||
|
tp_rank = get_tensor_model_parallel_rank()
|
||||||
|
state_dict = self.state_dict()
|
||||||
|
|
||||||
|
for name, loaded_weight in hf_model_weights_iterator(
|
||||||
|
model_name_or_path, cache_dir, use_np_cache):
|
||||||
|
if "rotary_emb.inv_freq" in name:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if "wte" in name or "lm_head" in name:
|
||||||
|
# Consider padding in the vocab size.
|
||||||
|
param = state_dict[name]
|
||||||
|
padded_vocab_size = param.shape[0] * tp_world_size
|
||||||
|
num_extra_rows = padded_vocab_size - self.config.vocab_size
|
||||||
|
extra_rows = torch.empty(num_extra_rows,
|
||||||
|
loaded_weight.shape[1])
|
||||||
|
extra_rows = extra_rows.to(loaded_weight)
|
||||||
|
loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
|
||||||
|
|
||||||
|
if "c_attn" in name:
|
||||||
|
total_num_heads = self.config.num_attention_heads
|
||||||
|
hidden_size = self.config.hidden_size
|
||||||
|
head_size = hidden_size // total_num_heads
|
||||||
|
num_heads = total_num_heads // tp_world_size
|
||||||
|
head_start = tp_rank * num_heads
|
||||||
|
head_end = (tp_rank + 1) * num_heads
|
||||||
|
|
||||||
|
if "weight" in name:
|
||||||
|
loaded_weight = loaded_weight.view(3, total_num_heads,
|
||||||
|
head_size, hidden_size)
|
||||||
|
loaded_weight = loaded_weight[:, head_start:head_end, :, :]
|
||||||
|
loaded_weight = loaded_weight.reshape(-1, hidden_size)
|
||||||
|
elif "bias" in name:
|
||||||
|
loaded_weight = loaded_weight.view(3, total_num_heads,
|
||||||
|
head_size)
|
||||||
|
loaded_weight = loaded_weight[:, head_start:head_end, :]
|
||||||
|
loaded_weight = loaded_weight.reshape(-1)
|
||||||
|
|
||||||
|
is_gate_up_weight = False
|
||||||
|
for stride_id, weight_name in enumerate(["w2", "w1"]):
|
||||||
|
if weight_name not in name:
|
||||||
|
continue
|
||||||
|
param = state_dict[name.replace(weight_name, "gate_up_proj")]
|
||||||
|
shard_size = param.shape[0] // 2
|
||||||
|
loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
|
||||||
|
(tp_rank + 1)]
|
||||||
|
param_slice = param.data[shard_size * stride_id:shard_size *
|
||||||
|
(stride_id + 1)]
|
||||||
|
assert param_slice.shape == loaded_weight.shape
|
||||||
|
param_slice.copy_(loaded_weight)
|
||||||
|
is_gate_up_weight = True
|
||||||
|
break
|
||||||
|
if is_gate_up_weight:
|
||||||
|
continue
|
||||||
|
|
||||||
|
param = state_dict[name]
|
||||||
|
load_tensor_parallel_weights(
|
||||||
|
param,
|
||||||
|
loaded_weight,
|
||||||
|
name,
|
||||||
|
self._column_parallel_weights,
|
||||||
|
self._row_parallel_weights,
|
||||||
|
tp_rank,
|
||||||
|
)
|
||||||
@@ -76,6 +76,8 @@ def hf_model_weights_iterator(
|
|||||||
state = torch.load(bin_file, map_location="cpu")
|
state = torch.load(bin_file, map_location="cpu")
|
||||||
for name, param in state.items():
|
for name, param in state.items():
|
||||||
yield name, param
|
yield name, param
|
||||||
|
del state
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
|
||||||
def load_tensor_parallel_weights(
|
def load_tensor_parallel_weights(
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ class SamplingParams:
|
|||||||
|
|
||||||
self._verify_args()
|
self._verify_args()
|
||||||
if self.use_beam_search:
|
if self.use_beam_search:
|
||||||
self._verity_beam_search()
|
self._verify_beam_search()
|
||||||
elif self.temperature < _SAMPLING_EPS:
|
elif self.temperature < _SAMPLING_EPS:
|
||||||
# Zero temperature means greedy sampling.
|
# Zero temperature means greedy sampling.
|
||||||
self._verify_greedy_sampling()
|
self._verify_greedy_sampling()
|
||||||
@@ -109,7 +109,7 @@ class SamplingParams:
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"logprobs must be non-negative, got {self.logprobs}.")
|
f"logprobs must be non-negative, got {self.logprobs}.")
|
||||||
|
|
||||||
def _verity_beam_search(self) -> None:
|
def _verify_beam_search(self) -> None:
|
||||||
if self.best_of == 1:
|
if self.best_of == 1:
|
||||||
raise ValueError("best_of must be greater than 1 when using beam "
|
raise ValueError("best_of must be greater than 1 when using beam "
|
||||||
f"search. Got {self.best_of}.")
|
f"search. Got {self.best_of}.")
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ from vllm.transformers_utils.configs import * # pylint: disable=wildcard-import
|
|||||||
_CONFIG_REGISTRY = {
|
_CONFIG_REGISTRY = {
|
||||||
"mpt": MPTConfig,
|
"mpt": MPTConfig,
|
||||||
"baichuan": BaiChuanConfig,
|
"baichuan": BaiChuanConfig,
|
||||||
|
"aquila": AquilaConfig,
|
||||||
|
"qwen": QWenConfig,
|
||||||
"RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
|
"RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
|
||||||
"RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
|
"RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
from vllm.transformers_utils.configs.mpt import MPTConfig
|
from vllm.transformers_utils.configs.mpt import MPTConfig
|
||||||
from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
|
from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
|
||||||
|
from vllm.transformers_utils.configs.aquila import AquilaConfig
|
||||||
|
from vllm.transformers_utils.configs.qwen import QWenConfig
|
||||||
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
||||||
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
||||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||||
@@ -8,5 +10,7 @@ from vllm.transformers_utils.configs.falcon import RWConfig
|
|||||||
__all__ = [
|
__all__ = [
|
||||||
"MPTConfig",
|
"MPTConfig",
|
||||||
"BaiChuanConfig",
|
"BaiChuanConfig",
|
||||||
|
"AquilaConfig",
|
||||||
|
"QWenConfig",
|
||||||
"RWConfig",
|
"RWConfig",
|
||||||
]
|
]
|
||||||
|
|||||||
63
vllm/transformers_utils/configs/aquila.py
Normal file
63
vllm/transformers_utils/configs/aquila.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||||
|
# and OPT implementations in this library. It has been modified from its
|
||||||
|
# original forms to accommodate minor architectural differences compared
|
||||||
|
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Aquila model configuration"""
|
||||||
|
|
||||||
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
|
|
||||||
|
class AquilaConfig(PretrainedConfig):
|
||||||
|
model_type = "aquila"
|
||||||
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
vocab_size=100008,
|
||||||
|
hidden_size=4096,
|
||||||
|
intermediate_size=11008,
|
||||||
|
num_hidden_layers=32,
|
||||||
|
num_attention_heads=32,
|
||||||
|
hidden_act="silu",
|
||||||
|
max_position_embeddings=2048,
|
||||||
|
initializer_range=0.006,
|
||||||
|
rms_norm_eps=1e-5,
|
||||||
|
use_cache=True,
|
||||||
|
pad_token_id=0,
|
||||||
|
bos_token_id=1,
|
||||||
|
eos_token_id=2,
|
||||||
|
tie_word_embeddings=False,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.hidden_act = hidden_act
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.rms_norm_eps = rms_norm_eps
|
||||||
|
self.use_cache = use_cache
|
||||||
|
super().__init__(
|
||||||
|
pad_token_id=pad_token_id,
|
||||||
|
bos_token_id=bos_token_id,
|
||||||
|
eos_token_id=eos_token_id,
|
||||||
|
tie_word_embeddings=tie_word_embeddings,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
71
vllm/transformers_utils/configs/qwen.py
Normal file
71
vllm/transformers_utils/configs/qwen.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
# Copyright (c) Alibaba Cloud.
|
||||||
|
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
|
||||||
|
|
||||||
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
|
|
||||||
|
class QWenConfig(PretrainedConfig):
    """Configuration for Qwen models (defaults match Qwen-7B).

    Uses GPT-2-style attribute names (``n_embd``, ``n_head``, ...) and
    exposes the HuggingFace-standard names through ``attribute_map``.
    """

    model_type = "qwen"
    # Cached key/value tensors are runtime state, not configuration.
    keys_to_ignore_at_inference = ["past_key_values"]
    # Map HuggingFace-standard attribute names onto Qwen's own names.
    attribute_map = {
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "max_position_embeddings": "n_positions",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=151851,
        n_embd=4096,
        n_layer=32,
        n_head=32,
        n_inner=None,
        embd_pdrop=0.0,
        attn_pdrop=0.0,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        eos_token_id=151643,
        apply_residual_connection_post_layernorm=False,
        bf16=True,
        kv_channels=128,
        rotary_pct=1.0,
        rotary_emb_base=10000,
        use_dynamic_ntk=False,
        use_logn_attn=False,
        use_flash_attn=True,
        ffn_hidden_size=22016,
        no_bias=True,
        tie_word_embeddings=False,
        **kwargs,
    ):
        # eos_token_id is stored explicitly and also forwarded to the base
        # class, mirroring the upstream Qwen configuration.
        self.eos_token_id = eos_token_id
        super().__init__(eos_token_id=eos_token_id,
                         tie_word_embeddings=tie_word_embeddings,
                         **kwargs)

        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.kv_channels = kv_channels
        self.ffn_hidden_size = ffn_hidden_size
        # Dropout, normalization and initialization.
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        # Attention / block behaviour.
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.use_flash_attn = use_flash_attn
        self.no_bias = no_bias
        # Rotary position embedding settings.
        self.rotary_pct = rotary_pct
        self.rotary_emb_base = rotary_emb_base
        self.use_dynamic_ntk = use_dynamic_ntk
        self.use_logn_attn = use_logn_attn
        # Numeric precision default.
        self.bf16 = bf16
        # Kept for parity with upstream: the base class already records this,
        # but the original config also stores it directly on the instance.
        self.tie_word_embeddings = tie_word_embeddings
|
||||||
@@ -72,7 +72,7 @@ def detokenize_incrementally(
|
|||||||
new_token_id: int,
|
new_token_id: int,
|
||||||
skip_special_tokens: bool,
|
skip_special_tokens: bool,
|
||||||
) -> Tuple[str, str]:
|
) -> Tuple[str, str]:
|
||||||
"""Detokenizes the new token in conjuction with the previous output tokens.
|
"""Detokenizes the new token in conjunction with the previous output tokens.
|
||||||
|
|
||||||
NOTE: This function does not update prev_output_tokens.
|
NOTE: This function does not update prev_output_tokens.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user