Compare commits
40 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a5dd03c1eb | ||
|
|
c18b3b8e8b | ||
|
|
9528e3a05e | ||
|
|
9fb52e523a | ||
|
|
e202dd2736 | ||
|
|
43813e6361 | ||
|
|
cede942b87 | ||
|
|
fe1e924811 | ||
|
|
4548c03c50 | ||
|
|
40b86aa05e | ||
|
|
432870829d | ||
|
|
f73d02aadc | ||
|
|
c5ebe040ac | ||
|
|
8d763cb891 | ||
|
|
cf4cd53982 | ||
|
|
32c9be2200 | ||
|
|
8aeaa910a2 | ||
|
|
906e05d840 | ||
|
|
ef9a2990ae | ||
|
|
7e90870491 | ||
|
|
d3f05c9248 | ||
|
|
c108781c85 | ||
|
|
3d184b95b8 | ||
|
|
2f35a022e6 | ||
|
|
ffe00ef77a | ||
|
|
5561681d04 | ||
|
|
fbd62d8750 | ||
|
|
2e26f9156a | ||
|
|
9e5452ee34 | ||
|
|
0e3fe896e2 | ||
|
|
1caca5a589 | ||
|
|
783921d889 | ||
|
|
4a98edff1f | ||
|
|
a7bab0c9e5 | ||
|
|
25950dca9b | ||
|
|
a4113b035c | ||
|
|
7e1665b089 | ||
|
|
8d1096e7db | ||
|
|
8d775dd30a | ||
|
|
78fe77534b |
@@ -52,7 +52,7 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Annotate release workflow"
|
- label: "Annotate release workflow"
|
||||||
|
|||||||
@@ -107,10 +107,9 @@ fi
|
|||||||
|
|
||||||
if [[ $commands == *" kernels/attention"* ]]; then
|
if [[ $commands == *" kernels/attention"* ]]; then
|
||||||
commands="${commands} \
|
commands="${commands} \
|
||||||
--ignore=kernels/attention/stest_attention_selector.py \
|
--ignore=kernels/attention/test_attention_selector.py \
|
||||||
--ignore=kernels/attention/test_blocksparse_attention.py \
|
--ignore=kernels/attention/test_blocksparse_attention.py \
|
||||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||||
--ignore=kernels/attention/test_attention_selector.py \
|
|
||||||
--ignore=kernels/attention/test_flash_attn.py \
|
--ignore=kernels/attention/test_flash_attn.py \
|
||||||
--ignore=kernels/attention/test_flashinfer.py \
|
--ignore=kernels/attention/test_flashinfer.py \
|
||||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||||
|
|||||||
2
.github/CODEOWNERS
vendored
2
.github/CODEOWNERS
vendored
@@ -16,7 +16,7 @@
|
|||||||
/vllm/lora @jeejeelee
|
/vllm/lora @jeejeelee
|
||||||
/vllm/reasoning @aarnphm
|
/vllm/reasoning @aarnphm
|
||||||
/vllm/entrypoints @aarnphm
|
/vllm/entrypoints @aarnphm
|
||||||
CMakeLists.txt @tlrmchlsmth
|
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||||
|
|
||||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||||
# so spam a lot of people
|
# so spam a lot of people
|
||||||
|
|||||||
2
.github/workflows/lint-and-deploy.yaml
vendored
2
.github/workflows/lint-and-deploy.yaml
vendored
@@ -68,7 +68,7 @@ jobs:
|
|||||||
export AWS_ACCESS_KEY_ID=minioadmin
|
export AWS_ACCESS_KEY_ID=minioadmin
|
||||||
export AWS_SECRET_ACCESS_KEY=minioadmin
|
export AWS_SECRET_ACCESS_KEY=minioadmin
|
||||||
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
|
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
|
||||||
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
|
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
|
||||||
|
|
||||||
- name: curl test
|
- name: curl test
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -259,7 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||||
|
|
||||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||||
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
|
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
|
||||||
|
|
||||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||||
@@ -615,6 +615,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"in CUDA target architectures.")
|
"in CUDA target architectures.")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
|
||||||
|
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
|
||||||
|
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
|
||||||
|
"if you intend on running FP8 quantized MoE models on Blackwell.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
|
||||||
|
"in CUDA target architectures")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# Machete kernels
|
# Machete kernels
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
import itertools
|
import itertools
|
||||||
|
|||||||
@@ -620,7 +620,7 @@ def main(args: argparse.Namespace):
|
|||||||
4096,
|
4096,
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
batch_sizes = [args.batch_size]
|
batch_sizes = args.batch_size
|
||||||
|
|
||||||
use_deep_gemm = bool(args.use_deep_gemm)
|
use_deep_gemm = bool(args.use_deep_gemm)
|
||||||
|
|
||||||
@@ -728,7 +728,7 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
parser.add_argument("--use-deep-gemm", action="store_true")
|
parser.add_argument("--use-deep-gemm", action="store_true")
|
||||||
parser.add_argument("--seed", type=int, default=0)
|
parser.add_argument("--seed", type=int, default=0)
|
||||||
parser.add_argument("--batch-size", type=int, required=False)
|
parser.add_argument("--batch-size", type=int, nargs="+", required=False)
|
||||||
parser.add_argument("--tune", action="store_true")
|
parser.add_argument("--tune", action="store_true")
|
||||||
parser.add_argument("--trust-remote-code", action="store_true")
|
parser.add_argument("--trust-remote-code", action="store_true")
|
||||||
parser.add_argument("--model-prefix", type=str, required=False)
|
parser.add_argument("--model-prefix", type=str, required=False)
|
||||||
|
|||||||
@@ -45,7 +45,6 @@
|
|||||||
#include "cute/algorithm/functional.hpp"
|
#include "cute/algorithm/functional.hpp"
|
||||||
#include "cute/atom/mma_atom.hpp"
|
#include "cute/atom/mma_atom.hpp"
|
||||||
#include "cute/algorithm/gemm.hpp"
|
#include "cute/algorithm/gemm.hpp"
|
||||||
#include "cute/tensor_predicate.hpp"
|
|
||||||
#include "cute/numeric/arithmetic_tuple.hpp"
|
#include "cute/numeric/arithmetic_tuple.hpp"
|
||||||
|
|
||||||
#include "cutlass_extensions/gemm/dispatch_policy.hpp"
|
#include "cutlass_extensions/gemm/dispatch_policy.hpp"
|
||||||
|
|||||||
@@ -162,10 +162,11 @@ __global__ void dynamic_scaled_int8_quant_kernel(
|
|||||||
|
|
||||||
// calculate for absmax
|
// calculate for absmax
|
||||||
float thread_max = 0.f;
|
float thread_max = 0.f;
|
||||||
for (int i = tid; i < hidden_size; i += stride) {
|
vectorize_read_with_alignment<16>(
|
||||||
const auto v = fabsf(static_cast<float>(row_in[i]));
|
row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) {
|
||||||
thread_max = fmaxf(thread_max, v);
|
const float v = fabsf(static_cast<float>(src));
|
||||||
}
|
thread_max = fmaxf(thread_max, v);
|
||||||
|
});
|
||||||
using BlockReduce = cub::BlockReduce<float, 256>;
|
using BlockReduce = cub::BlockReduce<float, 256>;
|
||||||
__shared__ typename BlockReduce::TempStorage tmp;
|
__shared__ typename BlockReduce::TempStorage tmp;
|
||||||
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
|
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
|
||||||
@@ -232,9 +233,10 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
|
|||||||
|
|
||||||
// 1. calculate min & max
|
// 1. calculate min & max
|
||||||
MinMax thread_mm;
|
MinMax thread_mm;
|
||||||
for (int i = tid; i < hidden_size; i += stride) {
|
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
|
||||||
thread_mm += static_cast<float>(row_in[i]);
|
[&] __device__(const scalar_t& src) {
|
||||||
}
|
thread_mm += static_cast<float>(src);
|
||||||
|
});
|
||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<MinMax, 256>;
|
using BlockReduce = cub::BlockReduce<MinMax, 256>;
|
||||||
__shared__ typename BlockReduce::TempStorage tmp;
|
__shared__ typename BlockReduce::TempStorage tmp;
|
||||||
|
|||||||
@@ -51,7 +51,8 @@ struct cutlass_3x_gemm {
|
|||||||
// These are the minimum alignments needed for the kernels to compile
|
// These are the minimum alignments needed for the kernels to compile
|
||||||
static constexpr int AlignmentAB =
|
static constexpr int AlignmentAB =
|
||||||
128 / cutlass::sizeof_bits<ElementAB>::value;
|
128 / cutlass::sizeof_bits<ElementAB>::value;
|
||||||
static constexpr int AlignmentCD = 4;
|
static constexpr int AlignmentCD =
|
||||||
|
128 / cutlass::sizeof_bits<ElementD>::value;
|
||||||
|
|
||||||
using CollectiveEpilogue =
|
using CollectiveEpilogue =
|
||||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||||
|
|||||||
@@ -0,0 +1,374 @@
|
|||||||
|
#include "core/registration.h"
|
||||||
|
|
||||||
|
#include <torch/all.h>
|
||||||
|
#include <cutlass/arch/arch.h>
|
||||||
|
|
||||||
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
#include <c10/cuda/CUDAStream.h>
|
||||||
|
|
||||||
|
#include "cute/tensor.hpp"
|
||||||
|
#include "cutlass/tensor_ref.h"
|
||||||
|
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||||
|
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||||
|
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||||
|
#include "cutlass/gemm/group_array_problem_shape.hpp"
|
||||||
|
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||||
|
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||||
|
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||||
|
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||||
|
|
||||||
|
#include "cutlass/util/command_line.h"
|
||||||
|
#include "cutlass/util/distribution.h"
|
||||||
|
#include "cutlass/util/host_tensor.h"
|
||||||
|
#include "cutlass/util/packed_stride.hpp"
|
||||||
|
#include "cutlass/util/tensor_view_io.h"
|
||||||
|
#include "cutlass/util/reference/device/gemm.h"
|
||||||
|
#include "cutlass/util/reference/device/tensor_compare.h"
|
||||||
|
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||||
|
#include "cutlass/util/reference/host/gett.hpp"
|
||||||
|
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||||
|
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
|
using namespace cute;
|
||||||
|
|
||||||
|
template <typename ElementAB, typename ElementC, typename ElementAccumulator,
|
||||||
|
typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
|
||||||
|
__global__ void get_ggemm_starts(
|
||||||
|
int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
|
||||||
|
ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
|
||||||
|
ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
|
||||||
|
ElementAB* b_base_as_int, ElementC* out_base_as_int,
|
||||||
|
ElementAccumulator* a_scale_base_as_int,
|
||||||
|
ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
|
||||||
|
LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
|
||||||
|
int expert_id = threadIdx.x;
|
||||||
|
|
||||||
|
if (expert_id >= gridDim.x * blockDim.x) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int m = problem_sizes[expert_id * 3];
|
||||||
|
int n = problem_sizes[expert_id * 3 + 1];
|
||||||
|
int k = problem_sizes[expert_id * 3 + 2];
|
||||||
|
|
||||||
|
int32_t expert_offset = expert_offsets[expert_id];
|
||||||
|
int a_stride = expert_offset * k;
|
||||||
|
int b_stride = expert_id * k * n;
|
||||||
|
int a_scale_stride = expert_offset * k / 128;
|
||||||
|
int b_scale_stride = expert_id * k * n / 128 / 128;
|
||||||
|
|
||||||
|
a_offsets[expert_id] = a_base_as_int + a_stride;
|
||||||
|
b_offsets[expert_id] = b_base_as_int + b_stride;
|
||||||
|
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
|
||||||
|
a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
|
||||||
|
b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
|
||||||
|
|
||||||
|
LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
|
||||||
|
LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
|
||||||
|
|
||||||
|
*layout_sfa_ptr =
|
||||||
|
ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
|
||||||
|
*layout_sfb_ptr =
|
||||||
|
ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
|
||||||
|
ScaleConfig) \
|
||||||
|
else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
|
||||||
|
get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA, \
|
||||||
|
LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>( \
|
||||||
|
static_cast<int32_t*>(expert_offsets.data_ptr()), \
|
||||||
|
static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()), \
|
||||||
|
static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()), \
|
||||||
|
static_cast<C_TYPE**>(out_ptrs.data_ptr()), \
|
||||||
|
static_cast<float**>(a_scales_ptrs.data_ptr()), \
|
||||||
|
static_cast<float**>(b_scales_ptrs.data_ptr()), \
|
||||||
|
static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()), \
|
||||||
|
static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()), \
|
||||||
|
static_cast<C_TYPE*>(out_tensors.data_ptr()), \
|
||||||
|
static_cast<float*>(a_scales.data_ptr()), \
|
||||||
|
static_cast<float*>(b_scales.data_ptr()), \
|
||||||
|
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()), \
|
||||||
|
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()), \
|
||||||
|
static_cast<int*>(problem_sizes.data_ptr())); \
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
|
||||||
|
void run_get_ggemm_starts(
|
||||||
|
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
|
||||||
|
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
|
||||||
|
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
|
||||||
|
torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
|
||||||
|
torch::Tensor out_tensors, torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
|
||||||
|
torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
|
||||||
|
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
|
||||||
|
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
|
||||||
|
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||||
|
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||||
|
TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
|
||||||
|
TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
|
||||||
|
|
||||||
|
int num_experts = (int)expert_offsets.size(0);
|
||||||
|
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
|
||||||
|
|
||||||
|
if (false) {
|
||||||
|
}
|
||||||
|
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
|
||||||
|
LayoutSFB, ScaleConfig)
|
||||||
|
__CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
|
||||||
|
LayoutSFB, ScaleConfig)
|
||||||
|
else {
|
||||||
|
TORCH_CHECK(false, "Unsupported output tensor type");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename OutType, typename ScheduleConfig, typename LayoutD>
|
||||||
|
void run_blockwise_scaled_group_mm(
|
||||||
|
torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
|
||||||
|
const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
|
||||||
|
const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
|
||||||
|
const torch::Tensor& stride_b, const torch::Tensor& stride_c,
|
||||||
|
const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
|
||||||
|
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
|
||||||
|
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
|
||||||
|
|
||||||
|
// Types
|
||||||
|
using ElementA = cutlass::float_e4m3_t;
|
||||||
|
using ElementB = cutlass::float_e4m3_t;
|
||||||
|
using ElementC = OutType;
|
||||||
|
using ElementD = ElementC;
|
||||||
|
using ElementAccumulator = float;
|
||||||
|
using LayoutA = cutlass::layout::RowMajor;
|
||||||
|
using LayoutB = cutlass::layout::ColumnMajor;
|
||||||
|
using LayoutC = LayoutD;
|
||||||
|
|
||||||
|
// Alignments
|
||||||
|
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
|
||||||
|
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
|
||||||
|
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
|
||||||
|
|
||||||
|
using ArchTag = cutlass::arch::Sm100;
|
||||||
|
using OperatorClass = cutlass::arch::OpClassTensorOp;
|
||||||
|
|
||||||
|
using CollectiveEpilogue =
|
||||||
|
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||||
|
ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
|
||||||
|
typename ScheduleConfig::ClusterShape,
|
||||||
|
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
|
||||||
|
ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
|
||||||
|
AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
|
||||||
|
|
||||||
|
using CollectiveMainloop =
|
||||||
|
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||||
|
ArchTag, OperatorClass, ElementA,
|
||||||
|
cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
|
||||||
|
AlignmentA, ElementB,
|
||||||
|
cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
|
||||||
|
AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
|
||||||
|
typename ScheduleConfig::ClusterShape,
|
||||||
|
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
|
||||||
|
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||||
|
typename ScheduleConfig::KernelSchedule>::CollectiveOp;
|
||||||
|
|
||||||
|
using GemmKernel =
|
||||||
|
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
|
||||||
|
CollectiveEpilogue, void>;
|
||||||
|
|
||||||
|
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||||
|
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
|
||||||
|
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
|
||||||
|
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
|
||||||
|
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
|
||||||
|
|
||||||
|
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
|
||||||
|
int num_experts = (int)expert_offsets.size(0);
|
||||||
|
|
||||||
|
Gemm gemm_op;
|
||||||
|
|
||||||
|
// Mainloop Arguments
|
||||||
|
typename GemmKernel::MainloopArguments mainloop_args{
|
||||||
|
static_cast<const ElementA**>(a_ptrs.data_ptr()),
|
||||||
|
static_cast<StrideA*>(stride_a.data_ptr()),
|
||||||
|
static_cast<const ElementB**>(b_ptrs.data_ptr()),
|
||||||
|
static_cast<StrideB*>(stride_b.data_ptr()),
|
||||||
|
static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
|
||||||
|
reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
|
||||||
|
layout_sfa.data_ptr()),
|
||||||
|
static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
|
||||||
|
reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
|
||||||
|
layout_sfb.data_ptr())};
|
||||||
|
|
||||||
|
cutlass::KernelHardwareInfo hw_info;
|
||||||
|
hw_info.device_id = a_ptrs.get_device();
|
||||||
|
hw_info.sm_count =
|
||||||
|
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
|
||||||
|
hw_info.device_id);
|
||||||
|
|
||||||
|
// Epilogue Arguments
|
||||||
|
typename GemmKernel::EpilogueArguments epilogue_args{
|
||||||
|
{}, // epilogue.thread
|
||||||
|
nullptr,
|
||||||
|
static_cast<StrideC*>(stride_c.data_ptr()),
|
||||||
|
static_cast<ElementD**>(out_ptrs.data_ptr()),
|
||||||
|
static_cast<StrideC*>(stride_c.data_ptr())};
|
||||||
|
|
||||||
|
UnderlyingProblemShape* problem_sizes_as_shapes =
|
||||||
|
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
|
||||||
|
|
||||||
|
// Gemm Arguments
|
||||||
|
typename GemmKernel::Arguments args{
|
||||||
|
cutlass::gemm::GemmUniversalMode::kGrouped,
|
||||||
|
{num_experts, problem_sizes_as_shapes, nullptr},
|
||||||
|
mainloop_args,
|
||||||
|
epilogue_args,
|
||||||
|
hw_info};
|
||||||
|
|
||||||
|
at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
|
||||||
|
const cudaStream_t stream =
|
||||||
|
at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
|
||||||
|
|
||||||
|
auto can_implement_status = gemm_op.can_implement(args);
|
||||||
|
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
|
||||||
|
"Failed to implement GEMM");
|
||||||
|
|
||||||
|
size_t workspace_size = gemm_op.get_workspace_size(args);
|
||||||
|
auto const workspace_options =
|
||||||
|
torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
|
||||||
|
auto workspace = torch::empty(workspace_size, workspace_options);
|
||||||
|
|
||||||
|
auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
|
||||||
|
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
|
||||||
|
|
||||||
|
status = gemm_op.run(stream);
|
||||||
|
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename OutType>
|
||||||
|
void blockwise_scaled_group_mm_dispatch_shape(
|
||||||
|
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||||
|
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
|
||||||
|
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
|
||||||
|
struct MmaConfig {
|
||||||
|
using ElementA = cutlass::float_e4m3_t;
|
||||||
|
using KernelSchedule =
|
||||||
|
cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
|
||||||
|
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||||
|
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
|
||||||
|
1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
|
||||||
|
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
|
||||||
|
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
|
||||||
|
using LayoutC = cutlass::layout::RowMajor;
|
||||||
|
using MmaTileShape = Shape<_128, _128, _128>;
|
||||||
|
using ClusterShape = Shape<_1, _1, _1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
int num_experts = (int)expert_offsets.size(0);
|
||||||
|
|
||||||
|
auto a_ptrs = torch::empty(
|
||||||
|
{num_experts},
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||||
|
auto b_ptrs = torch::empty(
|
||||||
|
{num_experts},
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||||
|
auto out_ptrs = torch::empty(
|
||||||
|
{num_experts},
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||||
|
auto a_scales_ptrs = torch::empty(
|
||||||
|
{num_experts},
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||||
|
auto b_scales_ptrs = torch::empty(
|
||||||
|
{num_experts},
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||||
|
|
||||||
|
auto layout_sfa = torch::empty(
|
||||||
|
{num_experts, 5},
|
||||||
|
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
|
||||||
|
auto layout_sfb = torch::empty(
|
||||||
|
{num_experts, 5},
|
||||||
|
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
|
||||||
|
|
||||||
|
auto stride_a = torch::full(
|
||||||
|
{num_experts}, a.size(1),
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||||
|
auto stride_b = torch::full(
|
||||||
|
{num_experts}, a.size(1),
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||||
|
auto stride_c = torch::full(
|
||||||
|
{num_experts}, output.size(1),
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
|
||||||
|
|
||||||
|
torch::TensorOptions options_int =
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
|
||||||
|
|
||||||
|
run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
|
||||||
|
typename MmaConfig::LayoutSFB,
|
||||||
|
typename MmaConfig::ScaleConfig>(
|
||||||
|
expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
|
||||||
|
b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
|
||||||
|
|
||||||
|
run_blockwise_scaled_group_mm<OutType, MmaConfig,
|
||||||
|
typename MmaConfig::LayoutC>(
|
||||||
|
out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
|
||||||
|
stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
|
||||||
|
expert_offsets);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cutlass_blockwise_scaled_grouped_mm(
|
||||||
|
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||||
|
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
|
||||||
|
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
|
||||||
|
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
|
||||||
|
TORCH_CHECK(problem_sizes.size(1) == 3,
|
||||||
|
"problem_sizes must have shape (num_experts, 3)");
|
||||||
|
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
|
||||||
|
"Number of experts in problem_sizes must match expert_offsets");
|
||||||
|
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
|
||||||
|
"problem_sizes must be int32");
|
||||||
|
TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
|
||||||
|
"a must be kFloat8_e4m3fn");
|
||||||
|
TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
|
||||||
|
"b must be kFloat8_e4m3fn");
|
||||||
|
TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
|
||||||
|
output.scalar_type() == torch::kHalf,
|
||||||
|
"output must be bfloat16 or half");
|
||||||
|
TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
|
||||||
|
"scales_a must be float32");
|
||||||
|
TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
|
||||||
|
"scales_b must be float32");
|
||||||
|
TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
|
||||||
|
"expert_offsets must be int32");
|
||||||
|
|
||||||
|
TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
|
||||||
|
TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
|
||||||
|
TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
|
||||||
|
TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
|
||||||
|
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
|
||||||
|
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
|
||||||
|
TORCH_CHECK(problem_sizes.size(1) == 3,
|
||||||
|
"problem_sizes must have shape (num_experts, 3)");
|
||||||
|
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
|
||||||
|
"Number of experts in problem_sizes must match expert_offsets");
|
||||||
|
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
|
||||||
|
"problem_sizes must be int32");
|
||||||
|
TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
|
||||||
|
|
||||||
|
#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
|
||||||
|
if (output.scalar_type() == torch::kBFloat16) {
|
||||||
|
blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
|
||||||
|
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
|
||||||
|
} else if (output.scalar_type() == torch::kFloat16) {
|
||||||
|
blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
|
||||||
|
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(false, "Unsupported output tensor type");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||||
|
m.impl("cutlass_blockwise_scaled_grouped_mm",
|
||||||
|
&cutlass_blockwise_scaled_grouped_mm);
|
||||||
|
}
|
||||||
@@ -38,7 +38,6 @@
|
|||||||
#include "cute/atom/mma_atom.hpp"
|
#include "cute/atom/mma_atom.hpp"
|
||||||
#include "cute/atom/copy_traits_sm90_tma.hpp"
|
#include "cute/atom/copy_traits_sm90_tma.hpp"
|
||||||
#include "cute/algorithm/gemm.hpp"
|
#include "cute/algorithm/gemm.hpp"
|
||||||
#include "cute/tensor_predicate.hpp"
|
|
||||||
#include "cute/numeric/arithmetic_tuple.hpp"
|
#include "cute/numeric/arithmetic_tuple.hpp"
|
||||||
#include "cutlass/pipeline/pipeline.hpp"
|
#include "cutlass/pipeline/pipeline.hpp"
|
||||||
#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
|
#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
|
||||||
|
|||||||
@@ -27,6 +27,26 @@ __device__ inline void vectorize_with_alignment(
|
|||||||
constexpr int WIDTH = VEC_SIZE * sizeof(InT); // eg: 64 B
|
constexpr int WIDTH = VEC_SIZE * sizeof(InT); // eg: 64 B
|
||||||
uintptr_t addr = reinterpret_cast<uintptr_t>(in);
|
uintptr_t addr = reinterpret_cast<uintptr_t>(in);
|
||||||
|
|
||||||
|
// fast path when the whole region is already aligned
|
||||||
|
// Note: currently the output is guaranteed to be same as the input, so we
|
||||||
|
// don't check it here, comments here just for future reference.
|
||||||
|
bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
|
||||||
|
if (can_vec) {
|
||||||
|
int num_vec = len / VEC_SIZE;
|
||||||
|
|
||||||
|
using vin_t = vec_n_t<InT, VEC_SIZE>;
|
||||||
|
using vout_t = vec_n_t<OutT, VEC_SIZE>;
|
||||||
|
auto* v_in = reinterpret_cast<const vin_t*>(in);
|
||||||
|
auto* v_out = reinterpret_cast<vout_t*>(out);
|
||||||
|
|
||||||
|
for (int i = tid; i < num_vec; i += stride) {
|
||||||
|
vout_t tmp;
|
||||||
|
vec_op(tmp, v_in[i]);
|
||||||
|
v_out[i] = tmp;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
int misalignment_offset = addr & (WIDTH - 1); // addr % 64
|
int misalignment_offset = addr & (WIDTH - 1); // addr % 64
|
||||||
int alignment_bytes = WIDTH - misalignment_offset; // 64 - (addr % 64)
|
int alignment_bytes = WIDTH - misalignment_offset; // 64 - (addr % 64)
|
||||||
int prefix_elems = alignment_bytes & (WIDTH - 1); // handle 64
|
int prefix_elems = alignment_bytes & (WIDTH - 1); // handle 64
|
||||||
@@ -72,4 +92,81 @@ __device__ __forceinline__ void vectorize_with_alignment(const InT* in,
|
|||||||
std::forward<ScaOp>(scalar_op));
|
std::forward<ScaOp>(scalar_op));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <int VEC_SIZE, typename InT, typename ScaOp>
|
||||||
|
struct DefaultReadVecOp {
|
||||||
|
ScaOp scalar_op;
|
||||||
|
|
||||||
|
__device__ __forceinline__ void operator()(
|
||||||
|
const vec_n_t<InT, VEC_SIZE>& src) const {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||||
|
scalar_op(src.val[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// read-only version: iterate over the input with alignment guarantees
|
||||||
|
template <int VEC_SIZE, typename InT, typename VecOp, typename ScaOp>
|
||||||
|
__device__ inline void vectorize_read_with_alignment(const InT* in, int len,
|
||||||
|
int tid, int stride,
|
||||||
|
VecOp&& vec_op,
|
||||||
|
ScaOp&& scalar_op) {
|
||||||
|
static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
|
||||||
|
"VEC_SIZE must be a positive power-of-two");
|
||||||
|
constexpr int WIDTH = VEC_SIZE * sizeof(InT);
|
||||||
|
uintptr_t addr = reinterpret_cast<uintptr_t>(in);
|
||||||
|
|
||||||
|
// fast path when the whole region is already aligned
|
||||||
|
bool can_vec = ((addr & (WIDTH - 1)) == 0) && ((len & (VEC_SIZE - 1)) == 0);
|
||||||
|
if (can_vec) {
|
||||||
|
int num_vec = len / VEC_SIZE;
|
||||||
|
|
||||||
|
using vin_t = vec_n_t<InT, VEC_SIZE>;
|
||||||
|
auto* v_in = reinterpret_cast<const vin_t*>(in);
|
||||||
|
|
||||||
|
for (int i = tid; i < num_vec; i += stride) {
|
||||||
|
vec_op(v_in[i]);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int misalignment_offset = addr & (WIDTH - 1);
|
||||||
|
int alignment_bytes = WIDTH - misalignment_offset;
|
||||||
|
int prefix_elems = alignment_bytes & (WIDTH - 1);
|
||||||
|
prefix_elems /= sizeof(InT);
|
||||||
|
prefix_elems = min(prefix_elems, len);
|
||||||
|
|
||||||
|
// 1. handle the possibly unaligned prefix with scalar access.
|
||||||
|
for (int i = tid; i < prefix_elems; i += stride) {
|
||||||
|
scalar_op(in[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
in += prefix_elems;
|
||||||
|
len -= prefix_elems;
|
||||||
|
|
||||||
|
int num_vec = len / VEC_SIZE;
|
||||||
|
using vin_t = vec_n_t<InT, VEC_SIZE>;
|
||||||
|
auto* v_in = reinterpret_cast<const vin_t*>(in);
|
||||||
|
|
||||||
|
// 2. vectorized traversal of the main aligned region.
|
||||||
|
for (int i = tid; i < num_vec; i += stride) {
|
||||||
|
vec_op(v_in[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. handle remaining tail elements.
|
||||||
|
int tail_start = num_vec * VEC_SIZE;
|
||||||
|
for (int i = tid + tail_start; i < len; i += stride) {
|
||||||
|
scalar_op(in[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// overload that requires only a scalar_op
|
||||||
|
template <int VEC_SIZE, typename InT, typename ScaOp>
|
||||||
|
__device__ __forceinline__ void vectorize_read_with_alignment(
|
||||||
|
const InT* in, int len, int tid, int stride, ScaOp&& scalar_op) {
|
||||||
|
using Vec = DefaultReadVecOp<VEC_SIZE, InT, std::decay_t<ScaOp>>;
|
||||||
|
vectorize_read_with_alignment<VEC_SIZE>(in, len, tid, stride, Vec{scalar_op},
|
||||||
|
std::forward<ScaOp>(scalar_op));
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|||||||
@@ -59,6 +59,8 @@ void apply_repetition_penalties_(
|
|||||||
int vocab_size = logits.size(-1);
|
int vocab_size = logits.size(-1);
|
||||||
int num_seqs = logits.size(0);
|
int num_seqs = logits.size(0);
|
||||||
|
|
||||||
|
if (num_seqs == 0) return;
|
||||||
|
|
||||||
// Get number of SMs on the current device
|
// Get number of SMs on the current device
|
||||||
int sms = 0;
|
int sms = 0;
|
||||||
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
|
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
|
||||||
|
|||||||
@@ -79,7 +79,8 @@ struct cutlass_sparse_3x_gemm {
|
|||||||
// These are the minimum alignments needed for the kernels to compile
|
// These are the minimum alignments needed for the kernels to compile
|
||||||
static constexpr int AlignmentAB =
|
static constexpr int AlignmentAB =
|
||||||
128 / cutlass::sizeof_bits<ElementAB>::value;
|
128 / cutlass::sizeof_bits<ElementAB>::value;
|
||||||
static constexpr int AlignmentCD = 4;
|
static constexpr int AlignmentCD =
|
||||||
|
128 / cutlass::sizeof_bits<ElementD>::value;
|
||||||
|
|
||||||
using CollectiveEpilogue =
|
using CollectiveEpilogue =
|
||||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||||
|
|||||||
@@ -393,6 +393,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
{stride_tag});
|
{stride_tag});
|
||||||
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
|
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
|
||||||
|
|
||||||
|
// cutlass blockwise scaledgroup GEMM
|
||||||
|
ops.def(
|
||||||
|
"cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
|
||||||
|
"Tensor scales_a, Tensor scales_b, "
|
||||||
|
"Tensor problem_sizes, Tensor expert_offsets) -> ()",
|
||||||
|
{stride_tag});
|
||||||
|
// conditionally compiled so impl registration is in source file
|
||||||
|
|
||||||
// cutlass nvfp4 block scaled group GEMM
|
// cutlass nvfp4 block scaled group GEMM
|
||||||
ops.def(
|
ops.def(
|
||||||
"cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"
|
"cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
|
||||||
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||||
# to run the OpenAI compatible server.
|
# to run the OpenAI compatible server.
|
||||||
|
|
||||||
@@ -62,12 +63,16 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
|
|||||||
ARG PIP_KEYRING_PROVIDER=disabled
|
ARG PIP_KEYRING_PROVIDER=disabled
|
||||||
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
|
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
|
||||||
|
|
||||||
|
# Flag enables build-in KV-connector dependency libs into docker images
|
||||||
|
ARG INSTALL_KV_CONNECTORS=false
|
||||||
|
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
# prepare basic build environment
|
# prepare basic build environment
|
||||||
FROM ${BUILD_BASE_IMAGE} AS base
|
FROM ${BUILD_BASE_IMAGE} AS base
|
||||||
ARG CUDA_VERSION
|
ARG CUDA_VERSION
|
||||||
ARG PYTHON_VERSION
|
ARG PYTHON_VERSION
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
|
ARG INSTALL_KV_CONNECTORS=false
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
ARG DEADSNAKES_MIRROR_URL
|
ARG DEADSNAKES_MIRROR_URL
|
||||||
@@ -276,6 +281,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
FROM ${FINAL_BASE_IMAGE} AS vllm-base
|
FROM ${FINAL_BASE_IMAGE} AS vllm-base
|
||||||
ARG CUDA_VERSION
|
ARG CUDA_VERSION
|
||||||
ARG PYTHON_VERSION
|
ARG PYTHON_VERSION
|
||||||
|
ARG INSTALL_KV_CONNECTORS=false
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /vllm-workspace
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
@@ -485,6 +491,7 @@ RUN mv mkdocs.yaml test_docs/
|
|||||||
# base openai image with additional requirements, for any subsequent openai-style images
|
# base openai image with additional requirements, for any subsequent openai-style images
|
||||||
FROM vllm-base AS vllm-openai-base
|
FROM vllm-base AS vllm-openai-base
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
|
ARG INSTALL_KV_CONNECTORS=false
|
||||||
|
|
||||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||||
@@ -493,8 +500,13 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
|||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
|
||||||
|
COPY requirements/kv_connectors.txt requirements/kv_connectors.txt
|
||||||
|
|
||||||
# install additional dependencies for openai api server
|
# install additional dependencies for openai api server
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
|
||||||
|
uv pip install --system -r requirements/kv_connectors.txt; \
|
||||||
|
fi; \
|
||||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
||||||
else \
|
else \
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ Before setting up the incremental build:
|
|||||||
VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
|
VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
|
||||||
```
|
```
|
||||||
|
|
||||||
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
|
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
|
||||||
|
|
||||||
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.
|
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.
|
||||||
|
|
||||||
|
|||||||
@@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
|
|||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
|
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
|
||||||
|
|
||||||
|
If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM
|
||||||
|
from vllm.assets.image import ImageAsset
|
||||||
|
|
||||||
|
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
||||||
|
image_url = "https://picsum.photos/id/32/512/512"
|
||||||
|
image_pil = ImageAsset('cherry_blossom').pil_image
|
||||||
|
image_embeds = torch.load(...)
|
||||||
|
|
||||||
|
conversation = [
|
||||||
|
{"role": "system", "content": "You are a helpful assistant"},
|
||||||
|
{"role": "user", "content": "Hello"},
|
||||||
|
{"role": "assistant", "content": "Hello! How can I assist you today?"},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": image_url
|
||||||
|
}
|
||||||
|
},{
|
||||||
|
"type": "image_pil",
|
||||||
|
"image_pil": image_pil
|
||||||
|
}, {
|
||||||
|
"type": "image_embeds",
|
||||||
|
"image_embeds": image_embeds
|
||||||
|
}, {
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in these images?"
|
||||||
|
}],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
# Perform inference and log output.
|
||||||
|
outputs = llm.chat(conversation)
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
```
|
||||||
|
|
||||||
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
|
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
|
||||||
|
|
||||||
??? Code
|
??? Code
|
||||||
@@ -228,7 +271,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
|
|||||||
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
|
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
|
||||||
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
||||||
|
|
||||||
For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
|
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
|
||||||
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
|
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
|
||||||
|
|
||||||
### Image Inputs
|
### Image Inputs
|
||||||
|
|||||||
@@ -470,6 +470,7 @@ Specified using `--task classify`.
|
|||||||
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
|
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
|
||||||
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
|
||||||
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
|
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
|
||||||
|
|
||||||
If your model is not in the above list, we will try to automatically convert the model using
|
If your model is not in the above list, we will try to automatically convert the model using
|
||||||
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
|
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
|
||||||
|
|
||||||
@@ -477,12 +478,20 @@ If your model is not in the above list, we will try to automatically convert the
|
|||||||
|
|
||||||
Specified using `--task score`.
|
Specified using `--task score`.
|
||||||
|
|
||||||
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|
||||||
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------|
|
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|---------------------|
|
||||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
|
||||||
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
|
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ |
|
||||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
|
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
|
||||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
|
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
|
||||||
|
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
Load the official original `mxbai-rerank-v2` by using the following command.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
|
||||||
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>.
|
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>.
|
||||||
@@ -490,6 +499,7 @@ Specified using `--task score`.
|
|||||||
```bash
|
```bash
|
||||||
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||||
```
|
```
|
||||||
|
|
||||||
[](){ #supported-mm-models }
|
[](){ #supported-mm-models }
|
||||||
|
|
||||||
## List of Multimodal Language Models
|
## List of Multimodal Language Models
|
||||||
@@ -616,9 +626,6 @@ Specified using `--task generate`.
|
|||||||
!!! note
|
!!! note
|
||||||
Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
|
Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
|
||||||
|
|
||||||
!!! note
|
|
||||||
`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.
|
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
||||||
|
|
||||||
@@ -661,11 +668,8 @@ Specified using `--task generate`.
|
|||||||
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
|
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
|
For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
|
||||||
`pip install git+https://github.com/huggingface/transformers.git`.
|
is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
|
||||||
|
|
||||||
Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
|
|
||||||
`--mm-processor-kwargs '{"use_audio_in_video": true}'`.
|
|
||||||
|
|
||||||
#### Transcription
|
#### Transcription
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import argparse
|
|||||||
|
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
|
from vllm.assets.image import ImageAsset
|
||||||
|
|
||||||
# This script is an offline demo for running Mistral-Small-3.1
|
# This script is an offline demo for running Mistral-Small-3.1
|
||||||
#
|
#
|
||||||
@@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
|
|||||||
)
|
)
|
||||||
|
|
||||||
prompt = "Describe this image in one sentence."
|
prompt = "Describe this image in one sentence."
|
||||||
image_url = "https://picsum.photos/id/237/200/300"
|
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "text", "text": prompt},
|
{"type": "text", "text": prompt},
|
||||||
{"type": "image_url", "image_url": {"url": image_url}},
|
{
|
||||||
|
"type": "image_pil",
|
||||||
|
"image_pil": ImageAsset("cherry_blossom").pil_image,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -57,7 +57,10 @@ Once you have collected your profiles with this script, you can visualize them u
|
|||||||
Here are most likely the dependencies you need to install:
|
Here are most likely the dependencies you need to install:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
|
pip install tensorflow-cpu \
|
||||||
|
tensorboard-plugin-profile \
|
||||||
|
etils \
|
||||||
|
importlib_resources
|
||||||
```
|
```
|
||||||
|
|
||||||
Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
|
Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
|
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
|
||||||
prompts = [f"Question: {question} Answer:" for question in questions]
|
prompts = [f"Question: {question} Answer:" for question in questions]
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model="Salesforce/blip2-opt-6.7b",
|
model="Salesforce/blip2-opt-2.7b",
|
||||||
limit_mm_per_prompt={modality: 1},
|
limit_mm_per_prompt={modality: 1},
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -677,6 +677,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
limit_mm_per_prompt={modality: 1},
|
limit_mm_per_prompt={modality: 1},
|
||||||
|
ignore_patterns=["consolidated.safetensors"],
|
||||||
)
|
)
|
||||||
|
|
||||||
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
|
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
|
||||||
@@ -970,7 +971,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# Qwen
|
# Qwen-VL
|
||||||
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
|
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
|
|||||||
@@ -505,6 +505,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
limit_mm_per_prompt={"image": len(image_urls)},
|
limit_mm_per_prompt={"image": len(image_urls)},
|
||||||
|
ignore_patterns=["consolidated.safetensors"],
|
||||||
)
|
)
|
||||||
|
|
||||||
placeholders = "[IMG]" * len(image_urls)
|
placeholders = "[IMG]" * len(image_urls)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import socket
|
import socket
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import asyncio
|
import asyncio
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
"""
|
"""
|
||||||
Set up this example by starting a vLLM OpenAI-compatible server with tool call
|
Set up this example by starting a vLLM OpenAI-compatible server with tool call
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
"""
|
"""
|
||||||
Set up this example by starting a vLLM OpenAI-compatible server with tool call
|
Set up this example by starting a vLLM OpenAI-compatible server with tool call
|
||||||
|
|||||||
@@ -13,13 +13,15 @@ vllm serve Qwen/Qwen2.5-3B-Instruct
|
|||||||
To serve a reasoning model, you can use the following command:
|
To serve a reasoning model, you can use the following command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
|
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
|
||||||
|
--reasoning-parser deepseek_r1
|
||||||
```
|
```
|
||||||
|
|
||||||
If you want to run this script standalone with `uv`, you can use the following:
|
If you want to run this script standalone with `uv`, you can use the following:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-output
|
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
|
||||||
|
structured-output
|
||||||
```
|
```
|
||||||
|
|
||||||
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
|
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
|
||||||
@@ -44,7 +46,9 @@ uv run structured_outputs.py --stream
|
|||||||
Run certain constraints, for example `structural_tag` and `regex`, streaming:
|
Run certain constraints, for example `structural_tag` and `regex`, streaming:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv run structured_outputs.py --constraint structural_tag regex --stream
|
uv run structured_outputs.py \
|
||||||
|
--constraint structural_tag regex \
|
||||||
|
--stream
|
||||||
```
|
```
|
||||||
|
|
||||||
Run all constraints, with reasoning models and streaming:
|
Run all constraints, with reasoning models and streaming:
|
||||||
|
|||||||
@@ -202,7 +202,7 @@ def parse_args():
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def deserialize():
|
def deserialize(args, tensorizer_config):
|
||||||
if args.lora_path:
|
if args.lora_path:
|
||||||
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
||||||
llm = LLM(model=args.model,
|
llm = LLM(model=args.model,
|
||||||
@@ -242,7 +242,7 @@ def deserialize():
|
|||||||
return llm
|
return llm
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
def main():
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
|
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
|
||||||
@@ -260,8 +260,6 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
model_ref = args.model
|
model_ref = args.model
|
||||||
|
|
||||||
model_name = model_ref.split("/")[1]
|
|
||||||
|
|
||||||
if args.command == "serialize" or args.command == "deserialize":
|
if args.command == "serialize" or args.command == "deserialize":
|
||||||
keyfile = args.keyfile
|
keyfile = args.keyfile
|
||||||
else:
|
else:
|
||||||
@@ -309,6 +307,10 @@ if __name__ == '__main__':
|
|||||||
encryption_keyfile = keyfile,
|
encryption_keyfile = keyfile,
|
||||||
**credentials
|
**credentials
|
||||||
)
|
)
|
||||||
deserialize()
|
deserialize(args, tensorizer_config)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Either serialize or deserialize must be specified.")
|
raise ValueError("Either serialize or deserialize must be specified.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|||||||
1
requirements/kv_connectors.txt
Normal file
1
requirements/kv_connectors.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
lmcache
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|||||||
57
tests/config/test_mp_reducer.py
Normal file
57
tests/config/test_mp_reducer.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import sys
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from vllm.config import VllmConfig
|
||||||
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||||
|
from vllm.v1.engine.async_llm import AsyncLLM
|
||||||
|
|
||||||
|
|
||||||
|
def test_mp_reducer(monkeypatch):
|
||||||
|
"""
|
||||||
|
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
|
||||||
|
without transformers_modules. This is a regression test for
|
||||||
|
https://github.com/vllm-project/vllm/pull/18640.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
|
||||||
|
monkeypatch.setenv('VLLM_USE_V1', '1')
|
||||||
|
|
||||||
|
# Ensure transformers_modules is not in sys.modules
|
||||||
|
if 'transformers_modules' in sys.modules:
|
||||||
|
del sys.modules['transformers_modules']
|
||||||
|
|
||||||
|
with patch('multiprocessing.reducer.register') as mock_register:
|
||||||
|
engine_args = AsyncEngineArgs(
|
||||||
|
model="facebook/opt-125m",
|
||||||
|
max_model_len=32,
|
||||||
|
gpu_memory_utilization=0.1,
|
||||||
|
disable_log_stats=True,
|
||||||
|
disable_log_requests=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
async_llm = AsyncLLM.from_engine_args(
|
||||||
|
engine_args,
|
||||||
|
start_engine_loop=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert mock_register.called, (
|
||||||
|
"multiprocessing.reducer.register should have been called")
|
||||||
|
|
||||||
|
vllm_config_registered = False
|
||||||
|
for call_args in mock_register.call_args_list:
|
||||||
|
# Verify that a reducer for VllmConfig was registered
|
||||||
|
if len(call_args[0]) >= 2 and call_args[0][0] == VllmConfig:
|
||||||
|
vllm_config_registered = True
|
||||||
|
|
||||||
|
reducer_func = call_args[0][1]
|
||||||
|
assert callable(
|
||||||
|
reducer_func), "Reducer function should be callable"
|
||||||
|
break
|
||||||
|
|
||||||
|
assert vllm_config_registered, (
|
||||||
|
"VllmConfig should have been registered to multiprocessing.reducer"
|
||||||
|
)
|
||||||
|
|
||||||
|
async_llm.shutdown()
|
||||||
@@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb):
|
|||||||
model_name = "openai/whisper-large-v3-turbo"
|
model_name = "openai/whisper-large-v3-turbo"
|
||||||
server_args = ["--enforce-eager"]
|
server_args = ["--enforce-eager"]
|
||||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||||
prompt = "THE FIRST WORDS I SPOKE"
|
|
||||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||||
client = remote_server.get_async_client()
|
client = remote_server.get_async_client()
|
||||||
transcription = await client.audio.transcriptions.create(
|
transcription = await client.audio.transcriptions.create(
|
||||||
@@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb):
|
|||||||
temperature=0.0)
|
temperature=0.0)
|
||||||
out = json.loads(transcription)['text']
|
out = json.loads(transcription)['text']
|
||||||
assert "Mary had a little lamb," in out
|
assert "Mary had a little lamb," in out
|
||||||
# This should "force" whisper to continue prompt in all caps
|
|
||||||
transcription_wprompt = await client.audio.transcriptions.create(
|
|
||||||
model=model_name,
|
|
||||||
file=mary_had_lamb,
|
|
||||||
language="en",
|
|
||||||
response_format="text",
|
|
||||||
prompt=prompt,
|
|
||||||
temperature=0.0)
|
|
||||||
out_capital = json.loads(transcription_wprompt)['text']
|
|
||||||
assert prompt not in out_capital
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@@ -238,3 +227,31 @@ async def test_sampling_params(mary_had_lamb):
|
|||||||
extra_body=dict(seed=42))
|
extra_body=dict(seed=42))
|
||||||
|
|
||||||
assert greedy_transcription.text != transcription.text
|
assert greedy_transcription.text != transcription.text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_audio_prompt(mary_had_lamb):
|
||||||
|
model_name = "openai/whisper-large-v3-turbo"
|
||||||
|
server_args = ["--enforce-eager"]
|
||||||
|
prompt = "This is a speech, recorded in a phonograph."
|
||||||
|
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||||
|
#Prompts should not omit the part of original prompt while transcribing.
|
||||||
|
prefix = "The first words I spoke in the original phonograph"
|
||||||
|
client = remote_server.get_async_client()
|
||||||
|
transcription = await client.audio.transcriptions.create(
|
||||||
|
model=model_name,
|
||||||
|
file=mary_had_lamb,
|
||||||
|
language="en",
|
||||||
|
response_format="text",
|
||||||
|
temperature=0.0)
|
||||||
|
out = json.loads(transcription)['text']
|
||||||
|
assert prefix in out
|
||||||
|
transcription_wprompt = await client.audio.transcriptions.create(
|
||||||
|
model=model_name,
|
||||||
|
file=mary_had_lamb,
|
||||||
|
language="en",
|
||||||
|
response_format="text",
|
||||||
|
prompt=prompt,
|
||||||
|
temperature=0.0)
|
||||||
|
out_prompt = json.loads(transcription_wprompt)['text']
|
||||||
|
assert prefix in out_prompt
|
||||||
|
|||||||
@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
|
|||||||
"url": image_url
|
"url": image_url
|
||||||
}
|
}
|
||||||
}, {
|
}, {
|
||||||
"type": "image_url",
|
"type": "image_pil",
|
||||||
"image_url": {
|
"image_pil": ImageAsset('cherry_blossom').pil_image
|
||||||
"url": image_url
|
|
||||||
}
|
|
||||||
}, {
|
}, {
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": "What's in these images?"
|
"text": "What's in these images?"
|
||||||
@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
|
|||||||
"url": image_url
|
"url": image_url
|
||||||
}
|
}
|
||||||
}, {
|
}, {
|
||||||
"type": "image_url",
|
"type": "image_pil",
|
||||||
"image_url": {
|
"image_pil": ImageAsset('cherry_blossom').pil_image
|
||||||
"url": image_url
|
|
||||||
}
|
|
||||||
}, {
|
}, {
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": "What's in these images?"
|
"text": "What's in these images?"
|
||||||
|
|||||||
@@ -450,7 +450,8 @@ def test_multi_query_kv_attention(
|
|||||||
start += seq_len
|
start += seq_len
|
||||||
# xformers.AttentionBias to Tensor for use in reference impl.
|
# xformers.AttentionBias to Tensor for use in reference impl.
|
||||||
alibi_bias = [
|
alibi_bias = [
|
||||||
b.materialize(b.shape, device=device).squeeze() for b in attn_bias
|
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
|
||||||
|
for b, i in zip(attn_bias, seq_lens)
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
|
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
|
||||||
|
|||||||
@@ -171,7 +171,7 @@ def test_env(
|
|||||||
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
|
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
|
||||||
assert backend.get_name() == expected
|
assert backend.get_name() == expected
|
||||||
else:
|
else:
|
||||||
backend = get_attn_backend(16,
|
backend = get_attn_backend(32,
|
||||||
torch.float16,
|
torch.float16,
|
||||||
torch.float16,
|
torch.float16,
|
||||||
block_size,
|
block_size,
|
||||||
@@ -180,6 +180,45 @@ def test_env(
|
|||||||
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
||||||
assert backend.get_name() == expected
|
assert backend.get_name() == expected
|
||||||
|
|
||||||
|
if use_v1:
|
||||||
|
backend = get_attn_backend(16,
|
||||||
|
torch.float16,
|
||||||
|
torch.float16,
|
||||||
|
block_size,
|
||||||
|
False,
|
||||||
|
use_mla=use_mla)
|
||||||
|
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||||
|
"Should fallback to FlexAttention if head size is "
|
||||||
|
"not supported by FlashAttention")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||||
|
@pytest.mark.parametrize("use_v1", [True, False])
|
||||||
|
def test_fp32_fallback(
|
||||||
|
device: str,
|
||||||
|
use_v1: bool,
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
):
|
||||||
|
"""Test attention backend selection with fp32."""
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||||
|
|
||||||
|
if device == "cpu":
|
||||||
|
with patch("vllm.attention.selector.current_platform",
|
||||||
|
CpuPlatform()):
|
||||||
|
backend = get_attn_backend(16, torch.float32, torch.float32,
|
||||||
|
16, False)
|
||||||
|
assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
|
||||||
|
if use_v1 else "TORCH_SDPA")
|
||||||
|
|
||||||
|
elif device == "cuda":
|
||||||
|
with patch("vllm.attention.selector.current_platform",
|
||||||
|
CudaPlatform()):
|
||||||
|
backend = get_attn_backend(16, torch.float32, torch.float32,
|
||||||
|
16, False)
|
||||||
|
assert (backend.get_name() == "FLEX_ATTENTION"
|
||||||
|
if use_v1 else "XFORMERS")
|
||||||
|
|
||||||
|
|
||||||
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Test FlashAttn validation."""
|
"""Test FlashAttn validation."""
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
DeepEP test utilities
|
DeepEP test utilities
|
||||||
"""
|
"""
|
||||||
@@ -137,8 +138,7 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
|
|||||||
low_latency_mode=low_latency_mode,
|
low_latency_mode=low_latency_mode,
|
||||||
num_qps_per_rank=num_qps_per_rank)
|
num_qps_per_rank=num_qps_per_rank)
|
||||||
return DeepEPHTPrepareAndFinalize(buffer=buffer,
|
return DeepEPHTPrepareAndFinalize(buffer=buffer,
|
||||||
world_size=pgi.world_size,
|
num_dispatchers=pgi.world_size,
|
||||||
rank=pgi.rank,
|
|
||||||
dp_size=dp_size,
|
dp_size=dp_size,
|
||||||
rank_expert_offset=pgi.rank *
|
rank_expert_offset=pgi.rank *
|
||||||
ht_args.num_local_experts)
|
ht_args.num_local_experts)
|
||||||
@@ -146,7 +146,6 @@ def make_deepep_ht_a2a(pg: ProcessGroup,
|
|||||||
|
|
||||||
def make_deepep_ll_a2a(pg: ProcessGroup,
|
def make_deepep_ll_a2a(pg: ProcessGroup,
|
||||||
pgi: ProcessGroupInfo,
|
pgi: ProcessGroupInfo,
|
||||||
dp_size: int,
|
|
||||||
deepep_ll_args: DeepEPLLArgs,
|
deepep_ll_args: DeepEPLLArgs,
|
||||||
q_dtype: Optional[torch.dtype] = None,
|
q_dtype: Optional[torch.dtype] = None,
|
||||||
block_shape: Optional[list[int]] = None):
|
block_shape: Optional[list[int]] = None):
|
||||||
@@ -166,8 +165,7 @@ def make_deepep_ll_a2a(pg: ProcessGroup,
|
|||||||
|
|
||||||
return DeepEPLLPrepareAndFinalize(
|
return DeepEPLLPrepareAndFinalize(
|
||||||
buffer=buffer,
|
buffer=buffer,
|
||||||
world_size=pgi.world_size,
|
num_dispatchers=pgi.world_size,
|
||||||
dp_size=dp_size,
|
|
||||||
max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
|
max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
|
||||||
use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
|
use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
|
||||||
)
|
)
|
||||||
@@ -186,5 +184,4 @@ def make_deepep_a2a(pg: ProcessGroup,
|
|||||||
block_shape)
|
block_shape)
|
||||||
|
|
||||||
assert deepep_ll_args is not None
|
assert deepep_ll_args is not None
|
||||||
return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
|
return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape)
|
||||||
block_shape)
|
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import triton.language as tl
|
|||||||
|
|
||||||
from tests.kernels.moe.utils import (batched_moe,
|
from tests.kernels.moe.utils import (batched_moe,
|
||||||
make_quantized_test_activations,
|
make_quantized_test_activations,
|
||||||
make_test_weights, triton_moe)
|
make_test_weights, naive_batched_moe)
|
||||||
from tests.kernels.quant_utils import native_batched_masked_quant_matmul
|
from tests.kernels.quant_utils import native_batched_masked_quant_matmul
|
||||||
from tests.kernels.utils import torch_experts
|
from tests.kernels.utils import torch_experts
|
||||||
from vllm.config import VllmConfig, set_current_vllm_config
|
from vllm.config import VllmConfig, set_current_vllm_config
|
||||||
@@ -33,12 +33,10 @@ MNK_FACTORS = [
|
|||||||
(45, 512, 512),
|
(45, 512, 512),
|
||||||
(45, 1024, 128),
|
(45, 1024, 128),
|
||||||
(45, 1024, 2048),
|
(45, 1024, 2048),
|
||||||
(64, 128, 128),
|
|
||||||
(64, 512, 512),
|
(64, 512, 512),
|
||||||
(64, 1024, 2048),
|
(64, 1024, 2048),
|
||||||
(222, 128, 128),
|
(222, 128, 128),
|
||||||
(222, 128, 2048),
|
(222, 128, 2048),
|
||||||
(222, 512, 512),
|
|
||||||
(222, 1024, 128),
|
(222, 1024, 128),
|
||||||
(222, 1024, 2048),
|
(222, 1024, 2048),
|
||||||
]
|
]
|
||||||
@@ -95,11 +93,12 @@ class BatchedMMTensors:
|
|||||||
@pytest.mark.parametrize("max_tokens_per_expert",
|
@pytest.mark.parametrize("max_tokens_per_expert",
|
||||||
[32, 64, 128, 192, 224, 256, 512])
|
[32, 64, 128, 192, 224, 256, 512])
|
||||||
@pytest.mark.parametrize("K", [128, 256, 1024])
|
@pytest.mark.parametrize("K", [128, 256, 1024])
|
||||||
@pytest.mark.parametrize("N", [128, 256, 512, 1024])
|
@pytest.mark.parametrize("N", [128, 256, 1024])
|
||||||
@pytest.mark.parametrize("dtype",
|
@pytest.mark.parametrize(
|
||||||
[torch.float32, torch.float16, torch.bfloat16])
|
"dtype",
|
||||||
@pytest.mark.parametrize("block_shape", [None])
|
[torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
|
||||||
@pytest.mark.parametrize("per_act_token_quant", [False])
|
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||||
|
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||||
def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
||||||
N: int, dtype: torch.dtype,
|
N: int, dtype: torch.dtype,
|
||||||
block_shape: Optional[list[int]],
|
block_shape: Optional[list[int]],
|
||||||
@@ -134,7 +133,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
|||||||
in_dtype=act_dtype,
|
in_dtype=act_dtype,
|
||||||
quant_dtype=quant_dtype,
|
quant_dtype=quant_dtype,
|
||||||
block_shape=block_shape,
|
block_shape=block_shape,
|
||||||
per_act_token_quant=per_act_token_quant)
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
)
|
||||||
|
|
||||||
B, B_q, B_scale, _, _, _ = make_test_weights(
|
B, B_q, B_scale, _, _, _ = make_test_weights(
|
||||||
num_experts,
|
num_experts,
|
||||||
@@ -143,6 +143,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
|||||||
in_dtype=act_dtype,
|
in_dtype=act_dtype,
|
||||||
quant_dtype=quant_dtype,
|
quant_dtype=quant_dtype,
|
||||||
block_shape=block_shape,
|
block_shape=block_shape,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
)
|
)
|
||||||
|
|
||||||
out_shape = (num_experts, max_tokens_per_expert, N)
|
out_shape = (num_experts, max_tokens_per_expert, N)
|
||||||
@@ -177,6 +178,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
|||||||
"BLOCK_SIZE_N": 16,
|
"BLOCK_SIZE_N": 16,
|
||||||
"BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
|
"BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
|
||||||
},
|
},
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
block_shape=block_shape,
|
block_shape=block_shape,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -185,15 +187,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
|||||||
B,
|
B,
|
||||||
ref_output,
|
ref_output,
|
||||||
num_expert_tokens,
|
num_expert_tokens,
|
||||||
None,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
|
q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
|
||||||
num_expert_tokens,
|
num_expert_tokens,
|
||||||
A_scale, B_scale,
|
A_scale, B_scale,
|
||||||
block_shape)
|
block_shape,
|
||||||
|
per_act_token_quant)
|
||||||
|
|
||||||
rtol, atol = {
|
rtol, atol = {
|
||||||
torch.float16: (6e-2, 6e-2),
|
torch.float16: (6e-2, 6e-2),
|
||||||
@@ -201,16 +201,17 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
|
|||||||
torch.float32: (1e-2, 1e-2),
|
torch.float32: (1e-2, 1e-2),
|
||||||
}[test_output.dtype]
|
}[test_output.dtype]
|
||||||
|
|
||||||
torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
|
torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
|
||||||
torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
|
torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
|
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
|
||||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||||
@pytest.mark.parametrize("topk", TOP_KS)
|
@pytest.mark.parametrize("topk", TOP_KS)
|
||||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
|
||||||
@pytest.mark.parametrize("per_act_token_quant", [False])
|
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||||
@pytest.mark.parametrize("block_shape", [None])
|
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||||
|
@pytest.mark.parametrize("input_scales", [False])
|
||||||
def test_fused_moe_batched_experts(
|
def test_fused_moe_batched_experts(
|
||||||
m: int,
|
m: int,
|
||||||
n: int,
|
n: int,
|
||||||
@@ -220,15 +221,19 @@ def test_fused_moe_batched_experts(
|
|||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
per_act_token_quant: bool,
|
per_act_token_quant: bool,
|
||||||
block_shape: Optional[list[int]],
|
block_shape: Optional[list[int]],
|
||||||
|
input_scales: bool,
|
||||||
):
|
):
|
||||||
current_platform.seed_everything(7)
|
current_platform.seed_everything(7)
|
||||||
|
|
||||||
use_fp8_w8a8 = dtype == torch.float8_e4m3fn
|
use_fp8_w8a8 = dtype == torch.float8_e4m3fn
|
||||||
|
|
||||||
|
if topk > e:
|
||||||
|
pytest.skip("topk > e")
|
||||||
|
|
||||||
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||||
pytest.skip("Skip quantization test for non-quantized type")
|
pytest.skip("Skip quantization test for non-quantized type")
|
||||||
|
|
||||||
if per_act_token_quant and block_shape is not None or topk > e:
|
if per_act_token_quant and block_shape is not None:
|
||||||
pytest.skip("Skip illegal quantization test.")
|
pytest.skip("Skip illegal quantization test.")
|
||||||
|
|
||||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||||
@@ -241,27 +246,26 @@ def test_fused_moe_batched_experts(
|
|||||||
act_dtype = dtype
|
act_dtype = dtype
|
||||||
quant_dtype = None
|
quant_dtype = None
|
||||||
|
|
||||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(e,
|
w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights(
|
||||||
n,
|
e,
|
||||||
k,
|
n,
|
||||||
block_shape=block_shape,
|
k,
|
||||||
in_dtype=act_dtype,
|
block_shape=block_shape,
|
||||||
quant_dtype=quant_dtype)
|
in_dtype=act_dtype,
|
||||||
|
quant_dtype=quant_dtype,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
)
|
||||||
|
|
||||||
|
if input_scales and quant_dtype is not None:
|
||||||
|
a1_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
|
||||||
|
a2_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
|
||||||
|
else:
|
||||||
|
a1_scale = None
|
||||||
|
a2_scale = None
|
||||||
|
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||||
batched_output = batched_moe(
|
|
||||||
a,
|
|
||||||
w1,
|
|
||||||
w2,
|
|
||||||
topk_weight,
|
|
||||||
topk_ids,
|
|
||||||
w1_scale=w1_s,
|
|
||||||
w2_scale=w2_s,
|
|
||||||
quant_dtype=quant_dtype,
|
|
||||||
per_act_token_quant=per_act_token_quant,
|
|
||||||
block_shape=block_shape,
|
|
||||||
)
|
|
||||||
baseline_output = torch_experts(
|
baseline_output = torch_experts(
|
||||||
a,
|
a,
|
||||||
w1,
|
w1,
|
||||||
@@ -270,11 +274,14 @@ def test_fused_moe_batched_experts(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
w1_scale=w1_s,
|
w1_scale=w1_s,
|
||||||
w2_scale=w2_s,
|
w2_scale=w2_s,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
quant_dtype=quant_dtype,
|
quant_dtype=quant_dtype,
|
||||||
per_act_token_quant=per_act_token_quant,
|
per_act_token_quant=per_act_token_quant,
|
||||||
block_shape=block_shape)
|
block_shape=block_shape,
|
||||||
|
)
|
||||||
|
|
||||||
triton_output = triton_moe(
|
batched_output = naive_batched_moe(
|
||||||
a,
|
a,
|
||||||
w1,
|
w1,
|
||||||
w2,
|
w2,
|
||||||
@@ -282,14 +289,31 @@ def test_fused_moe_batched_experts(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
w1_scale=w1_s,
|
w1_scale=w1_s,
|
||||||
w2_scale=w2_s,
|
w2_scale=w2_s,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
quant_dtype=quant_dtype,
|
quant_dtype=quant_dtype,
|
||||||
per_act_token_quant=per_act_token_quant,
|
per_act_token_quant=per_act_token_quant,
|
||||||
block_shape=block_shape,
|
block_shape=block_shape,
|
||||||
)
|
)
|
||||||
|
|
||||||
torch.testing.assert_close(triton_output,
|
triton_output = batched_moe(
|
||||||
|
a,
|
||||||
|
w1,
|
||||||
|
w2,
|
||||||
|
topk_weight,
|
||||||
|
topk_ids,
|
||||||
|
w1_scale=w1_s,
|
||||||
|
w2_scale=w2_s,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
|
quant_dtype=quant_dtype,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
block_shape=block_shape,
|
||||||
|
)
|
||||||
|
|
||||||
|
torch.testing.assert_close(batched_output,
|
||||||
baseline_output,
|
baseline_output,
|
||||||
atol=2e-2,
|
atol=3e-2,
|
||||||
rtol=2e-2)
|
rtol=2e-2)
|
||||||
|
|
||||||
torch.testing.assert_close(triton_output,
|
torch.testing.assert_close(triton_output,
|
||||||
|
|||||||
116
tests/kernels/moe/test_cutlass_grouped_gemm.py
Normal file
116
tests/kernels/moe/test_cutlass_grouped_gemm.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
# DeepGEMM Style Cutlass Grouped GEMM Test
|
||||||
|
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py
|
||||||
|
|
||||||
|
import random
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import baseline_scaled_mm
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|
||||||
|
def cdiv(a, b):
|
||||||
|
return (a + b - 1) // b
|
||||||
|
|
||||||
|
|
||||||
|
def per_token_cast_to_fp8(
|
||||||
|
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
assert x.dim() == 2
|
||||||
|
m, n = x.shape
|
||||||
|
pad_size = (128 - (n % 128)) % 128
|
||||||
|
x = torch.nn.functional.pad(x,
|
||||||
|
(0, pad_size), value=0) if pad_size > 0 else x
|
||||||
|
x_view = x.view(m, -1, 128)
|
||||||
|
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
|
||||||
|
fp8_data = (x_view *
|
||||||
|
(448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn)
|
||||||
|
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
|
||||||
|
|
||||||
|
|
||||||
|
def per_block_cast_to_fp8(
|
||||||
|
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
assert x.dim() == 2
|
||||||
|
m, n = x.shape
|
||||||
|
x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128),
|
||||||
|
device=x.device,
|
||||||
|
dtype=x.dtype)
|
||||||
|
x_padded[:m, :n] = x
|
||||||
|
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
|
||||||
|
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
|
||||||
|
x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn)
|
||||||
|
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
|
||||||
|
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
|
||||||
|
(4, 8192, 7168, 4096),
|
||||||
|
(4, 8192, 2048, 7168),
|
||||||
|
(8, 4096, 7168, 4096),
|
||||||
|
(8, 4096, 2048, 7168),
|
||||||
|
(32, 1024, 7168, 4096),
|
||||||
|
(32, 1024, 2048, 7168),
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("out_dtype", [torch.float16])
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
(lambda x: x is None or x.to_int() != 100)(
|
||||||
|
current_platform.get_device_capability()),
|
||||||
|
reason="Block Scaled Grouped GEMM is only supported on SM100.")
|
||||||
|
def test_cutlass_grouped_gemm(
|
||||||
|
num_groups: int,
|
||||||
|
expected_m_per_group: int,
|
||||||
|
k: int,
|
||||||
|
n: int,
|
||||||
|
out_dtype: torch.dtype,
|
||||||
|
):
|
||||||
|
device = "cuda"
|
||||||
|
alignment = 128
|
||||||
|
group_ms = [
|
||||||
|
int(expected_m_per_group * random.uniform(0.7, 1.3))
|
||||||
|
for _ in range(num_groups)
|
||||||
|
]
|
||||||
|
m = sum([cdiv(m, alignment) * alignment for m in group_ms])
|
||||||
|
|
||||||
|
x = torch.randn((m, k), device=device, dtype=out_dtype)
|
||||||
|
y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
|
||||||
|
out = torch.empty((m, n), device=device, dtype=out_dtype)
|
||||||
|
ref_out = torch.randn((m, n), device=device, dtype=out_dtype)
|
||||||
|
|
||||||
|
ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
|
||||||
|
pb_size = []
|
||||||
|
for i in range(num_groups):
|
||||||
|
pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
|
||||||
|
problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
|
||||||
|
expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
|
||||||
|
|
||||||
|
x_fp8 = per_token_cast_to_fp8(x)
|
||||||
|
y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn),
|
||||||
|
torch.empty((num_groups, cdiv(n, 128), k // 128),
|
||||||
|
device=device,
|
||||||
|
dtype=torch.float))
|
||||||
|
for i in range(num_groups):
|
||||||
|
y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i])
|
||||||
|
|
||||||
|
for i in range(num_groups):
|
||||||
|
a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]]
|
||||||
|
a_scale = x_fp8[1][ep_offset[i]:ep_offset[i + 1]]
|
||||||
|
b = y_fp8[0][i].t()
|
||||||
|
b_scale = y_fp8[1][i].t()
|
||||||
|
baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
|
||||||
|
ref_out[ep_offset[i]:ep_offset[i + 1]] = baseline
|
||||||
|
|
||||||
|
ops.cutlass_blockwise_scaled_grouped_mm(
|
||||||
|
out,
|
||||||
|
x_fp8[0],
|
||||||
|
y_fp8[0],
|
||||||
|
x_fp8[1],
|
||||||
|
y_fp8[1],
|
||||||
|
problem_sizes,
|
||||||
|
expert_offsets[:-1],
|
||||||
|
)
|
||||||
|
|
||||||
|
torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
Test DeepEP + DeepGEMM integration
|
Test DeepEP + DeepGEMM integration
|
||||||
DeepGEMM are gemm kernels specialized for the
|
DeepGEMM are gemm kernels specialized for the
|
||||||
@@ -148,8 +149,7 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
|
|||||||
|
|
||||||
fused_experts = BatchedDeepGemmExperts(
|
fused_experts = BatchedDeepGemmExperts(
|
||||||
max_num_tokens=max_tokens_per_rank,
|
max_num_tokens=max_tokens_per_rank,
|
||||||
world_size=pgi.world_size,
|
num_dispatchers=pgi.world_size // dp_size,
|
||||||
dp_size=dp_size,
|
|
||||||
block_shape=test_config.block_size,
|
block_shape=test_config.block_size,
|
||||||
per_act_token_quant=test_config.per_act_token_quant)
|
per_act_token_quant=test_config.per_act_token_quant)
|
||||||
mk = FusedMoEModularKernel(prepare_finalize=a2a,
|
mk = FusedMoEModularKernel(prepare_finalize=a2a,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
Test deepep dispatch-combine logic
|
Test deepep dispatch-combine logic
|
||||||
"""
|
"""
|
||||||
@@ -154,12 +155,13 @@ def make_modular_kernel(
|
|||||||
deepep_ht_args = ht_args,
|
deepep_ht_args = ht_args,
|
||||||
deepep_ll_args = ll_args)
|
deepep_ll_args = ll_args)
|
||||||
|
|
||||||
|
num_dispatchers = pgi.world_size // dp_size
|
||||||
|
|
||||||
if low_latency_mode:
|
if low_latency_mode:
|
||||||
assert not per_act_token_quant, "not supported in ll mode"
|
assert not per_act_token_quant, "not supported in ll mode"
|
||||||
fused_experts = BatchedTritonExperts(
|
fused_experts = BatchedTritonExperts(
|
||||||
max_num_tokens=MAX_TOKENS_PER_RANK,
|
max_num_tokens=MAX_TOKENS_PER_RANK,
|
||||||
world_size=pgi.world_size,
|
num_dispatchers=num_dispatchers,
|
||||||
dp_size=dp_size,
|
|
||||||
use_fp8_w8a8=is_quantized,
|
use_fp8_w8a8=is_quantized,
|
||||||
use_int8_w8a8=False,
|
use_int8_w8a8=False,
|
||||||
use_int8_w8a16=False,
|
use_int8_w8a16=False,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
Unit-test DeepGEMM FP8 kernels (no DeepEP).
|
Unit-test DeepGEMM FP8 kernels (no DeepEP).
|
||||||
Compare DeepGEMM path against the Triton fallback inside vLLM's fused_experts.
|
Compare DeepGEMM path against the Triton fallback inside vLLM's fused_experts.
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
|||||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||||
FusedMoEModularKernel)
|
FusedMoEModularKernel)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import cdiv
|
||||||
|
|
||||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||||
|
|
||||||
@@ -112,18 +113,21 @@ def pplx_cutlass_moe(
|
|||||||
w2_scale = w2_scale.to(device)
|
w2_scale = w2_scale.to(device)
|
||||||
a1_scale = a1_scale.to(device)
|
a1_scale = a1_scale.to(device)
|
||||||
|
|
||||||
|
assert num_experts % world_size == 0
|
||||||
|
num_local_experts = cdiv(num_experts, world_size)
|
||||||
|
num_dispatchers = pgi.world_size // dp_size
|
||||||
|
|
||||||
prepare_finalize = PplxPrepareAndFinalize(
|
prepare_finalize = PplxPrepareAndFinalize(
|
||||||
ata,
|
ata,
|
||||||
max_num_tokens,
|
max_num_tokens=max_num_tokens,
|
||||||
pgi.world_size,
|
num_local_experts=num_local_experts,
|
||||||
rank,
|
num_dispatchers=num_dispatchers)
|
||||||
dp_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,
|
experts = CutlassExpertsFp8(num_local_experts,
|
||||||
out_dtype,
|
out_dtype,
|
||||||
per_act_token,
|
per_act_token,
|
||||||
per_out_ch,
|
per_out_ch,
|
||||||
|
num_dispatchers=num_dispatchers,
|
||||||
use_batched_format=True)
|
use_batched_format=True)
|
||||||
|
|
||||||
fused_cutlass_experts = FusedMoEModularKernel(
|
fused_cutlass_experts = FusedMoEModularKernel(
|
||||||
@@ -181,35 +185,40 @@ def _pplx_moe(
|
|||||||
per_out_ch: bool,
|
per_out_ch: bool,
|
||||||
use_internode: bool,
|
use_internode: bool,
|
||||||
):
|
):
|
||||||
if use_internode:
|
try:
|
||||||
uid = nvshmem_get_unique_id(
|
if use_internode:
|
||||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
uid = nvshmem_get_unique_id(
|
||||||
torch.distributed.broadcast(uid, src=0)
|
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
torch.distributed.broadcast(uid, src=0)
|
||||||
else:
|
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||||
group_ranks = list(range(pgi.world_size))
|
else:
|
||||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
group_ranks = list(range(pgi.world_size))
|
||||||
group_name = cpu_group.group_name
|
cpu_group = torch.distributed.new_group(group_ranks,
|
||||||
|
backend="gloo")
|
||||||
|
group_name = cpu_group.group_name
|
||||||
|
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
torch_output = torch_experts(a_full, w1_full, w2_full, topk_weights,
|
torch_output = torch_experts(a_full, w1_full, w2_full,
|
||||||
topk_ids)
|
topk_weights, topk_ids)
|
||||||
pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
|
pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
|
||||||
w2_scale, topk_weights, topk_ids,
|
w2_scale, topk_weights, topk_ids,
|
||||||
a1_scale, out_dtype, per_act_token,
|
a1_scale, out_dtype, per_act_token,
|
||||||
per_out_ch, group_name)
|
per_out_ch, group_name)
|
||||||
|
|
||||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||||
pgi.world_size).to(pplx_output.device)
|
pgi.world_size).to(pplx_output.device)
|
||||||
|
|
||||||
# Uncomment if more debugging is needed
|
# Uncomment if more debugging is needed
|
||||||
# print("PPLX OUT:", pplx_output)
|
# print("PPLX OUT:", pplx_output)
|
||||||
# print("TORCH OUT:", torch_output)
|
# print("TORCH OUT:", torch_output)
|
||||||
|
|
||||||
torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
|
torch.testing.assert_close(pplx_output,
|
||||||
|
torch_output,
|
||||||
if use_internode:
|
atol=0.05,
|
||||||
nvshmem_finalize()
|
rtol=0)
|
||||||
|
finally:
|
||||||
|
if use_internode:
|
||||||
|
nvshmem_finalize()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("m", [2, 224])
|
@pytest.mark.parametrize("m", [2, 224])
|
||||||
|
|||||||
@@ -4,7 +4,10 @@
|
|||||||
|
|
||||||
Run `pytest tests/kernels/test_pplx_moe.py`.
|
Run `pytest tests/kernels/test_pplx_moe.py`.
|
||||||
"""
|
"""
|
||||||
from typing import Optional
|
import itertools
|
||||||
|
import textwrap
|
||||||
|
import traceback
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@@ -19,12 +22,13 @@ except ImportError:
|
|||||||
has_pplx = False
|
has_pplx = False
|
||||||
|
|
||||||
from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
|
from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
|
||||||
|
from tests.kernels.quant_utils import dequant
|
||||||
from tests.kernels.utils import torch_experts
|
from tests.kernels.utils import torch_experts
|
||||||
from vllm.config import VllmConfig, set_current_vllm_config
|
from vllm.config import VllmConfig, set_current_vllm_config
|
||||||
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
|
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
|
||||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||||
BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
|
BatchedTritonExperts)
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
|
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
|
||||||
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||||
FusedMoEModularKernel)
|
FusedMoEModularKernel)
|
||||||
@@ -38,22 +42,22 @@ requires_pplx = pytest.mark.skipif(
|
|||||||
reason="Requires PPLX kernels",
|
reason="Requires PPLX kernels",
|
||||||
)
|
)
|
||||||
|
|
||||||
PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
|
PPLX_COMBOS = [
|
||||||
(222, 2048, 1024)]
|
# TODO: figure out why this fails, seems to be test problem
|
||||||
|
#(1, 128, 128),
|
||||||
PPLX_MOE_COMBOS = [
|
|
||||||
(1, 128, 128),
|
|
||||||
(2, 128, 512),
|
(2, 128, 512),
|
||||||
(3, 1024, 2048),
|
(3, 1024, 2048),
|
||||||
(32, 128, 1024),
|
(4, 128, 128),
|
||||||
|
(32, 1024, 512),
|
||||||
(45, 512, 2048),
|
(45, 512, 2048),
|
||||||
(64, 1024, 1024),
|
(64, 1024, 512),
|
||||||
(222, 1024, 2048),
|
(222, 2048, 1024),
|
||||||
|
(256, 1408, 2048),
|
||||||
]
|
]
|
||||||
|
|
||||||
NUM_EXPERTS = [8, 64]
|
NUM_EXPERTS = [8, 64]
|
||||||
EP_SIZE = [1, 4]
|
|
||||||
TOP_KS = [1, 2, 6]
|
TOP_KS = [1, 2, 6]
|
||||||
|
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
|
||||||
|
|
||||||
vllm_config = VllmConfig()
|
vllm_config = VllmConfig()
|
||||||
vllm_config.scheduler_config.max_num_seqs = 128
|
vllm_config.scheduler_config.max_num_seqs = 128
|
||||||
@@ -169,9 +173,11 @@ def test_fused_moe_batched_experts(
|
|||||||
|
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||||
baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
|
baseline_output = torch_experts(a, w1, w2, topk_weight,
|
||||||
|
topk_ids) # only for baseline
|
||||||
torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
|
torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
|
||||||
batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids)
|
batched_output = naive_batched_moe(
|
||||||
|
a, w1, w2, topk_weight, topk_ids) # pick torch_experts or this
|
||||||
|
|
||||||
torch.testing.assert_close(baseline_output,
|
torch.testing.assert_close(baseline_output,
|
||||||
torch_output,
|
torch_output,
|
||||||
@@ -183,6 +189,63 @@ def test_fused_moe_batched_experts(
|
|||||||
rtol=0)
|
rtol=0)
|
||||||
|
|
||||||
|
|
||||||
|
def create_pplx_prepare_finalize(
|
||||||
|
num_tokens: int,
|
||||||
|
hidden_dim: int,
|
||||||
|
topk: int,
|
||||||
|
num_experts: int,
|
||||||
|
rank: int,
|
||||||
|
dp_size: int,
|
||||||
|
world_size: int,
|
||||||
|
in_dtype: torch.dtype,
|
||||||
|
quant_dtype: Optional[torch.dtype],
|
||||||
|
block_shape: Optional[list[int]],
|
||||||
|
per_act_token_quant: bool,
|
||||||
|
group_name: Optional[str],
|
||||||
|
):
|
||||||
|
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||||
|
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
|
||||||
|
|
||||||
|
max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
|
||||||
|
num_local_experts = rank_chunk(num_experts, 0, world_size)
|
||||||
|
|
||||||
|
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
|
||||||
|
max_num_tokens,
|
||||||
|
hidden_dim,
|
||||||
|
in_dtype,
|
||||||
|
quant_dtype,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
block_shape=block_shape,
|
||||||
|
)
|
||||||
|
|
||||||
|
args = dict(
|
||||||
|
max_num_tokens=max_num_tokens,
|
||||||
|
num_experts=num_experts,
|
||||||
|
experts_per_token=topk,
|
||||||
|
rank=rank,
|
||||||
|
world_size=world_size,
|
||||||
|
dp_size=dp_size,
|
||||||
|
hidden_dim=hidden_dim,
|
||||||
|
hidden_dim_bytes=hidden_dim_bytes,
|
||||||
|
hidden_dim_scale_bytes=scale_bytes,
|
||||||
|
)
|
||||||
|
|
||||||
|
if group_name is None:
|
||||||
|
ata = AllToAll.internode(**args)
|
||||||
|
else:
|
||||||
|
args["group_name"] = group_name
|
||||||
|
ata = AllToAll.intranode(**args)
|
||||||
|
|
||||||
|
prepare_finalize = PplxPrepareAndFinalize(
|
||||||
|
ata,
|
||||||
|
max_num_tokens=max_num_tokens,
|
||||||
|
num_local_experts=num_local_experts,
|
||||||
|
num_dispatchers=world_size // dp_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
return prepare_finalize, ata
|
||||||
|
|
||||||
|
|
||||||
def rank_chunk(num: int, r: int, w: int) -> int:
|
def rank_chunk(num: int, r: int, w: int) -> int:
|
||||||
rem = num % w
|
rem = num % w
|
||||||
return (num // w) + (1 if r < rem else 0)
|
return (num // w) + (1 if r < rem else 0)
|
||||||
@@ -193,6 +256,35 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
|
|||||||
return t[(r * chunk):(r + 1) * chunk]
|
return t[(r * chunk):(r + 1) * chunk]
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_chunk_by_rank(t: Optional[torch.Tensor], r: int,
|
||||||
|
w: int) -> Optional[torch.Tensor]:
|
||||||
|
if t is not None:
|
||||||
|
return chunk_by_rank(t, r, w)
|
||||||
|
else:
|
||||||
|
return t
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int,
|
||||||
|
w: int) -> Optional[torch.Tensor]:
|
||||||
|
if t is not None and t.numel() > 1:
|
||||||
|
chunk = rank_chunk(t.shape[0], r, w)
|
||||||
|
return t[(r * chunk):(r + 1) * chunk]
|
||||||
|
else:
|
||||||
|
return t
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_scales(t: Optional[torch.Tensor], start: int,
|
||||||
|
end: int) -> Optional[torch.Tensor]:
|
||||||
|
if t is not None and t.numel() > 1:
|
||||||
|
return t[start:end]
|
||||||
|
else:
|
||||||
|
return t
|
||||||
|
|
||||||
|
|
||||||
|
def dummy_work(a: torch.Tensor) -> torch.Tensor:
|
||||||
|
return a * 1.1
|
||||||
|
|
||||||
|
|
||||||
def pplx_prepare_finalize(
|
def pplx_prepare_finalize(
|
||||||
pgi: ProcessGroupInfo,
|
pgi: ProcessGroupInfo,
|
||||||
dp_size: int,
|
dp_size: int,
|
||||||
@@ -200,11 +292,11 @@ def pplx_prepare_finalize(
|
|||||||
topk_weight: torch.Tensor,
|
topk_weight: torch.Tensor,
|
||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
num_experts: int,
|
num_experts: int,
|
||||||
|
quant_dtype: Optional[torch.dtype],
|
||||||
|
block_shape: Optional[list[int]],
|
||||||
|
per_act_token_quant: bool,
|
||||||
group_name: Optional[str],
|
group_name: Optional[str],
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
|
||||||
PplxPrepareAndFinalize)
|
|
||||||
|
|
||||||
assert torch.cuda.current_device() == pgi.local_rank
|
assert torch.cuda.current_device() == pgi.local_rank
|
||||||
|
|
||||||
topk = topk_ids.shape[1]
|
topk = topk_ids.shape[1]
|
||||||
@@ -212,60 +304,66 @@ def pplx_prepare_finalize(
|
|||||||
device = pgi.device
|
device = pgi.device
|
||||||
rank = pgi.rank
|
rank = pgi.rank
|
||||||
world_size = pgi.world_size
|
world_size = pgi.world_size
|
||||||
max_num_tokens = rank_chunk(num_tokens, 0, world_size)
|
|
||||||
|
|
||||||
args = dict(
|
|
||||||
max_num_tokens=max_num_tokens,
|
|
||||||
num_experts=num_experts,
|
|
||||||
experts_per_token=topk,
|
|
||||||
rank=rank,
|
|
||||||
world_size=world_size,
|
|
||||||
dp_size=dp_size,
|
|
||||||
hidden_dim=hidden_dim,
|
|
||||||
hidden_dim_bytes=hidden_dim * a.dtype.itemsize,
|
|
||||||
hidden_dim_scale_bytes=0,
|
|
||||||
)
|
|
||||||
|
|
||||||
if group_name is None:
|
|
||||||
ata = AllToAll.internode(**args)
|
|
||||||
else:
|
|
||||||
args["group_name"] = group_name
|
|
||||||
ata = AllToAll.intranode(**args)
|
|
||||||
|
|
||||||
topk_ids = topk_ids.to(dtype=torch.uint32)
|
topk_ids = topk_ids.to(dtype=torch.uint32)
|
||||||
|
|
||||||
prepare_finalize = PplxPrepareAndFinalize(
|
prepare_finalize, ata = create_pplx_prepare_finalize(
|
||||||
ata,
|
num_tokens,
|
||||||
max_num_tokens,
|
hidden_dim,
|
||||||
world_size,
|
topk,
|
||||||
|
num_experts,
|
||||||
rank,
|
rank,
|
||||||
dp_size,
|
dp_size,
|
||||||
|
world_size,
|
||||||
|
a.dtype,
|
||||||
|
quant_dtype,
|
||||||
|
block_shape,
|
||||||
|
per_act_token_quant,
|
||||||
|
group_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
assert a.shape[0] == topk_ids.shape[0]
|
||||||
|
|
||||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
||||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
||||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
||||||
|
|
||||||
|
assert a_chunk.shape[0] == chunk_topk_ids.shape[0]
|
||||||
|
|
||||||
|
out = torch.full(
|
||||||
|
a_chunk.shape,
|
||||||
|
torch.nan,
|
||||||
|
dtype=a.dtype,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
|
||||||
|
if (quant_dtype is not None and not per_act_token_quant
|
||||||
|
and block_shape is None):
|
||||||
|
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||||
|
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||||
|
else:
|
||||||
|
a1_scale = None
|
||||||
|
a2_scale = None
|
||||||
|
|
||||||
b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
|
b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
|
||||||
a_chunk,
|
a_chunk,
|
||||||
None,
|
a1_scale,
|
||||||
None,
|
a2_scale,
|
||||||
chunk_topk_weight,
|
chunk_topk_weight,
|
||||||
chunk_topk_ids,
|
chunk_topk_ids,
|
||||||
num_experts,
|
num_experts,
|
||||||
None,
|
None,
|
||||||
False,
|
False,
|
||||||
FusedMoEQuantConfig(),
|
FusedMoEQuantConfig(
|
||||||
|
quant_dtype,
|
||||||
|
per_act_token_quant,
|
||||||
|
False,
|
||||||
|
block_shape,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
b_a = b_a * 1.5
|
b_a = dummy_work(
|
||||||
|
dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
|
||||||
out = torch.full(
|
|
||||||
(max_num_tokens, hidden_dim),
|
|
||||||
torch.nan,
|
|
||||||
dtype=a.dtype,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
|
|
||||||
prepare_finalize.finalize(
|
prepare_finalize.finalize(
|
||||||
out,
|
out,
|
||||||
@@ -291,70 +389,96 @@ def _pplx_prepare_finalize(
|
|||||||
score: torch.Tensor,
|
score: torch.Tensor,
|
||||||
topk: torch.Tensor,
|
topk: torch.Tensor,
|
||||||
num_experts: int,
|
num_experts: int,
|
||||||
|
quant_dtype: Optional[torch.dtype],
|
||||||
|
block_shape: Optional[list[int]],
|
||||||
|
per_act_token_quant: bool,
|
||||||
use_internode: bool,
|
use_internode: bool,
|
||||||
):
|
):
|
||||||
if use_internode:
|
try:
|
||||||
uid = nvshmem_get_unique_id(
|
if use_internode:
|
||||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
uid = nvshmem_get_unique_id(
|
||||||
torch.distributed.broadcast(uid, src=0)
|
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
torch.distributed.broadcast(uid, src=0)
|
||||||
group_name = None
|
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||||
else:
|
group_name = None
|
||||||
group_ranks = list(range(pgi.world_size))
|
else:
|
||||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
group_ranks = list(range(pgi.world_size))
|
||||||
group_name = cpu_group.group_name
|
cpu_group = torch.distributed.new_group(group_ranks,
|
||||||
|
backend="gloo")
|
||||||
|
group_name = cpu_group.group_name
|
||||||
|
|
||||||
device = pgi.device
|
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||||
|
m, k = a.shape
|
||||||
|
|
||||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
|
||||||
k = a.shape[1]
|
|
||||||
|
|
||||||
a_rep = torch.repeat_interleave(a, topk, dim=0).to(device)
|
torch_output = (a_rep.view(m, topk, k) *
|
||||||
|
topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(
|
||||||
|
dim=1)
|
||||||
|
|
||||||
torch_output = (a_rep.view(-1, topk, k) * 1.5 *
|
pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight,
|
||||||
topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to(
|
topk_ids, num_experts, quant_dtype,
|
||||||
a.dtype)
|
block_shape, per_act_token_quant,
|
||||||
|
group_name)
|
||||||
|
|
||||||
pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids,
|
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
||||||
num_experts, group_name)
|
pgi.world_size).to(pgi.device)
|
||||||
|
|
||||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
torch.testing.assert_close(pplx_output,
|
||||||
pgi.world_size).to(pplx_output.device)
|
torch_output,
|
||||||
|
atol=3e-2,
|
||||||
torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
|
rtol=3e-2)
|
||||||
|
finally:
|
||||||
if use_internode:
|
if use_internode:
|
||||||
nvshmem_finalize()
|
nvshmem_finalize()
|
||||||
|
|
||||||
|
|
||||||
# TODO (bnell): this test point does not work for odd M due to how the test is
|
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
|
||||||
# written, not due to limitations of the pplx kernels. The pplx_moe
|
|
||||||
# test below is able to deal with odd M.
|
|
||||||
# TODO (bnell) add fp8 tests
|
|
||||||
@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS)
|
|
||||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||||
@pytest.mark.parametrize("topk", TOP_KS)
|
@pytest.mark.parametrize("topk", TOP_KS)
|
||||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
@pytest.mark.parametrize("dtype", DTYPES)
|
||||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||||
|
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||||
|
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||||
@pytest.mark.parametrize("use_internode", [False])
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
|
@pytest.mark.optional
|
||||||
@requires_pplx
|
@requires_pplx
|
||||||
def test_pplx_prepare_finalize(
|
def test_pplx_prepare_finalize_slow(
|
||||||
mnk: tuple[int, int, int],
|
mnk: tuple[int, int, int],
|
||||||
e: int,
|
e: int,
|
||||||
topk: int,
|
topk: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
world_dp_size: tuple[int, int],
|
world_dp_size: tuple[int, int],
|
||||||
|
per_act_token_quant: bool,
|
||||||
|
block_shape: Optional[list[int]],
|
||||||
use_internode: bool,
|
use_internode: bool,
|
||||||
):
|
):
|
||||||
|
if dtype == torch.float8_e4m3fn:
|
||||||
|
use_fp8_w8a8 = True
|
||||||
|
act_dtype = torch.bfloat16
|
||||||
|
quant_dtype = dtype
|
||||||
|
else:
|
||||||
|
use_fp8_w8a8 = False
|
||||||
|
act_dtype = dtype
|
||||||
|
quant_dtype = None
|
||||||
|
|
||||||
|
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||||
|
pytest.skip("Skip quantization test for non-quantized type")
|
||||||
|
|
||||||
|
if per_act_token_quant and block_shape is not None:
|
||||||
|
pytest.skip("Skip illegal quantization combination")
|
||||||
|
|
||||||
current_platform.seed_everything(7)
|
current_platform.seed_everything(7)
|
||||||
m, n, k = mnk
|
m, n, k = mnk
|
||||||
world_size, dp_size = world_dp_size
|
world_size, dp_size = world_dp_size
|
||||||
device = "cuda"
|
device = "cuda"
|
||||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
|
||||||
score = torch.randn((m, e), device=device, dtype=dtype)
|
a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
|
||||||
|
score = torch.randn((m, e), device=device, dtype=act_dtype)
|
||||||
|
|
||||||
parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
|
parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score,
|
||||||
topk, e, use_internode)
|
topk, e, quant_dtype, block_shape, per_act_token_quant,
|
||||||
|
use_internode)
|
||||||
|
|
||||||
|
|
||||||
def pplx_moe(
|
def pplx_moe(
|
||||||
@@ -369,84 +493,62 @@ def pplx_moe(
|
|||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
w1_scale: Optional[torch.Tensor] = None,
|
w1_scale: Optional[torch.Tensor] = None,
|
||||||
w2_scale: Optional[torch.Tensor] = None,
|
w2_scale: Optional[torch.Tensor] = None,
|
||||||
qtype: Optional[torch.dtype] = None,
|
a1_scale: Optional[torch.Tensor] = None,
|
||||||
|
a2_scale: Optional[torch.Tensor] = None,
|
||||||
|
quant_dtype: Optional[torch.dtype] = None,
|
||||||
per_act_token_quant=False,
|
per_act_token_quant=False,
|
||||||
block_shape: Optional[list[int]] = None,
|
block_shape: Optional[list[int]] = None,
|
||||||
use_compile: bool = False,
|
use_compile: bool = False,
|
||||||
use_cudagraphs: bool = True,
|
use_cudagraphs: bool = True,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
|
||||||
PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
|
|
||||||
|
|
||||||
device = torch.device("cuda", rank)
|
num_tokens, hidden_dim = a.shape
|
||||||
hidden_dim = a.shape[1]
|
|
||||||
num_experts = w1.shape[0]
|
num_experts = w1.shape[0]
|
||||||
topk = topk_ids.shape[1]
|
topk = topk_ids.shape[1]
|
||||||
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64)
|
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16)
|
||||||
|
|
||||||
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
|
prepare_finalize, ata = create_pplx_prepare_finalize(
|
||||||
max_num_tokens,
|
num_tokens,
|
||||||
hidden_dim,
|
hidden_dim,
|
||||||
|
topk,
|
||||||
|
num_experts,
|
||||||
|
rank,
|
||||||
|
dp_size,
|
||||||
|
world_size,
|
||||||
a.dtype,
|
a.dtype,
|
||||||
qtype,
|
quant_dtype,
|
||||||
per_act_token_quant=per_act_token_quant,
|
block_shape,
|
||||||
block_shape=block_shape,
|
per_act_token_quant,
|
||||||
|
group_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
args = dict(
|
|
||||||
max_num_tokens=max_num_tokens,
|
|
||||||
num_experts=num_experts,
|
|
||||||
experts_per_token=topk,
|
|
||||||
rank=rank,
|
|
||||||
world_size=world_size,
|
|
||||||
dp_size=dp_size,
|
|
||||||
hidden_dim=hidden_dim,
|
|
||||||
hidden_dim_bytes=hidden_dim_bytes,
|
|
||||||
hidden_dim_scale_bytes=scale_bytes,
|
|
||||||
)
|
|
||||||
|
|
||||||
if group_name is None:
|
|
||||||
ata = AllToAll.internode(**args)
|
|
||||||
else:
|
|
||||||
args["group_name"] = group_name
|
|
||||||
ata = AllToAll.intranode(**args)
|
|
||||||
|
|
||||||
topk_ids = topk_ids.to(dtype=torch.uint32)
|
topk_ids = topk_ids.to(dtype=torch.uint32)
|
||||||
|
|
||||||
prepare_finalize = PplxPrepareAndFinalize(
|
experts = BatchedTritonExperts(
|
||||||
ata,
|
max_num_tokens=max_num_tokens,
|
||||||
max_num_tokens,
|
num_dispatchers=prepare_finalize.num_dispatchers(),
|
||||||
world_size,
|
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||||
rank,
|
block_shape=block_shape,
|
||||||
dp_size,
|
per_act_token_quant=per_act_token_quant,
|
||||||
)
|
)
|
||||||
|
|
||||||
experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
|
|
||||||
world_size=world_size,
|
|
||||||
dp_size=dp_size,
|
|
||||||
use_fp8_w8a8=qtype == torch.float8_e4m3fn,
|
|
||||||
block_shape=block_shape)
|
|
||||||
|
|
||||||
fused_experts = FusedMoEModularKernel(
|
fused_experts = FusedMoEModularKernel(
|
||||||
prepare_finalize,
|
prepare_finalize,
|
||||||
experts,
|
experts,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Note: workers with the same dp_rank must use the exact same inputs.
|
# Note: workers with the same dp_rank must use the exact same inputs.
|
||||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
a_chunk = chunk_by_rank(a, rank, world_size)
|
||||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size)
|
||||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size)
|
||||||
|
|
||||||
# Chunking weights like this only works for batched format
|
# Chunking weights like this only works for batched format
|
||||||
w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
|
w1_chunk = chunk_by_rank(w1, rank, world_size)
|
||||||
w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
|
w2_chunk = chunk_by_rank(w2, rank, world_size)
|
||||||
|
w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size)
|
||||||
if w1_scale is not None:
|
w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size)
|
||||||
w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
|
a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size)
|
||||||
w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
|
a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size)
|
||||||
else:
|
|
||||||
w1_scale_chunk = None
|
|
||||||
w2_scale_chunk = None
|
|
||||||
|
|
||||||
# Note: for now use_compile will error out if the problem size is
|
# Note: for now use_compile will error out if the problem size is
|
||||||
# large enough to trigger chunking. I'm leaving the flag and
|
# large enough to trigger chunking. I'm leaving the flag and
|
||||||
@@ -468,6 +570,8 @@ def pplx_moe(
|
|||||||
chunk_topk_ids,
|
chunk_topk_ids,
|
||||||
w1_scale=w1_scale_chunk,
|
w1_scale=w1_scale_chunk,
|
||||||
w2_scale=w2_scale_chunk,
|
w2_scale=w2_scale_chunk,
|
||||||
|
a1_scale=a1_scale_chunk,
|
||||||
|
a2_scale=a2_scale_chunk,
|
||||||
global_num_experts=num_experts)
|
global_num_experts=num_experts)
|
||||||
|
|
||||||
if use_cudagraphs:
|
if use_cudagraphs:
|
||||||
@@ -482,6 +586,8 @@ def pplx_moe(
|
|||||||
chunk_topk_ids,
|
chunk_topk_ids,
|
||||||
w1_scale=w1_scale_chunk,
|
w1_scale=w1_scale_chunk,
|
||||||
w2_scale=w2_scale_chunk,
|
w2_scale=w2_scale_chunk,
|
||||||
|
a1_scale=a1_scale_chunk,
|
||||||
|
a2_scale=a2_scale_chunk,
|
||||||
global_num_experts=num_experts)
|
global_num_experts=num_experts)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
@@ -494,48 +600,6 @@ def pplx_moe(
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
|
|
||||||
assert torch.cuda.current_device() == pgi.local_rank
|
|
||||||
|
|
||||||
num_experts = w1.shape[0]
|
|
||||||
device = pgi.device
|
|
||||||
rank = pgi.rank
|
|
||||||
world_size = pgi.world_size
|
|
||||||
max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
|
|
||||||
|
|
||||||
prepare_finalize = BatchedPrepareAndFinalize(
|
|
||||||
max_num_tokens=max_num_tokens,
|
|
||||||
world_size=world_size,
|
|
||||||
dp_size=dp_size,
|
|
||||||
rank=rank,
|
|
||||||
)
|
|
||||||
|
|
||||||
experts = NaiveBatchedExperts(max_num_tokens=a.shape[0],
|
|
||||||
world_size=1,
|
|
||||||
dp_size=1)
|
|
||||||
|
|
||||||
fused_experts = FusedMoEModularKernel(
|
|
||||||
prepare_finalize,
|
|
||||||
experts,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Note: workers with the same dp_rank must use the exact same inputs.
|
|
||||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
|
||||||
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
|
|
||||||
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
|
|
||||||
|
|
||||||
out = fused_experts(
|
|
||||||
a_chunk,
|
|
||||||
# Chunking weights like this only works for batched format
|
|
||||||
chunk_by_rank(w1, rank, world_size).to(device),
|
|
||||||
chunk_by_rank(w2, rank, world_size).to(device),
|
|
||||||
chunk_topk_weight,
|
|
||||||
chunk_topk_ids,
|
|
||||||
global_num_experts=num_experts)
|
|
||||||
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def _pplx_moe(
|
def _pplx_moe(
|
||||||
pgi: ProcessGroupInfo,
|
pgi: ProcessGroupInfo,
|
||||||
dp_size: int,
|
dp_size: int,
|
||||||
@@ -544,75 +608,130 @@ def _pplx_moe(
|
|||||||
w2: torch.Tensor,
|
w2: torch.Tensor,
|
||||||
score: torch.Tensor,
|
score: torch.Tensor,
|
||||||
topk: int,
|
topk: int,
|
||||||
|
num_experts: int,
|
||||||
w1_s: Optional[torch.Tensor] = None,
|
w1_s: Optional[torch.Tensor] = None,
|
||||||
w2_s: Optional[torch.Tensor] = None,
|
w2_s: Optional[torch.Tensor] = None,
|
||||||
qtype: Optional[torch.dtype] = None,
|
quant_dtype: Optional[torch.dtype] = None,
|
||||||
per_act_token_quant: bool = False,
|
per_act_token_quant: bool = False,
|
||||||
block_shape: Optional[list[int]] = None,
|
block_shape: Optional[list[int]] = None,
|
||||||
use_internode: bool = False,
|
use_internode: bool = False,
|
||||||
):
|
):
|
||||||
if use_internode:
|
try:
|
||||||
uid = nvshmem_get_unique_id(
|
if use_internode:
|
||||||
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
uid = nvshmem_get_unique_id(
|
||||||
torch.distributed.broadcast(uid, src=0)
|
) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
|
||||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
torch.distributed.broadcast(uid, src=0)
|
||||||
group_name = None
|
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||||
else:
|
group_name = None
|
||||||
group_ranks = list(range(pgi.world_size))
|
else:
|
||||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
group_ranks = list(range(pgi.world_size))
|
||||||
group_name = cpu_group.group_name
|
cpu_group = torch.distributed.new_group(group_ranks,
|
||||||
|
backend="gloo")
|
||||||
|
group_name = cpu_group.group_name
|
||||||
|
|
||||||
m, k = a.shape
|
m, k = a.shape
|
||||||
e, _, n = w2.shape
|
e, _, n = w2.shape
|
||||||
|
|
||||||
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
|
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
|
||||||
|
|
||||||
device = torch.device("cuda", pgi.rank)
|
device = torch.device("cuda", pgi.rank)
|
||||||
a = a.to(device)
|
rank = pgi.rank
|
||||||
w1 = w1.to(device)
|
world_size = pgi.world_size
|
||||||
w2 = w2.to(device)
|
|
||||||
w1_s = w1_s.to(device) if w1_s is not None else None
|
|
||||||
w2_s = w2_s.to(device) if w2_s is not None else None
|
|
||||||
|
|
||||||
with set_current_vllm_config(vllm_config), override_config(moe_config):
|
a = a.to(device)
|
||||||
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
w1 = w1.to(device)
|
||||||
torch_output = torch_experts(a,
|
w2 = w2.to(device)
|
||||||
w1,
|
w1_s = w1_s.to(device) if w1_s is not None else None
|
||||||
w2,
|
w2_s = w2_s.to(device) if w2_s is not None else None
|
||||||
topk_weight,
|
|
||||||
topk_ids,
|
|
||||||
w1_scale=w1_s,
|
|
||||||
w2_scale=w2_s,
|
|
||||||
quant_dtype=qtype,
|
|
||||||
per_act_token_quant=per_act_token_quant,
|
|
||||||
block_shape=block_shape)
|
|
||||||
pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
|
|
||||||
a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
|
|
||||||
qtype, per_act_token_quant, block_shape)
|
|
||||||
# TODO (bnell): fix + re-enable
|
|
||||||
#batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
|
|
||||||
# topk_ids)
|
|
||||||
|
|
||||||
torch_output = chunk_by_rank(torch_output, pgi.rank,
|
if (quant_dtype is not None and not per_act_token_quant
|
||||||
pgi.world_size).to(pplx_output.device)
|
and block_shape is None):
|
||||||
|
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||||
|
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||||
|
else:
|
||||||
|
a1_scale = None
|
||||||
|
a2_scale = None
|
||||||
|
|
||||||
torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
|
with set_current_vllm_config(vllm_config), override_config(moe_config):
|
||||||
#torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0)
|
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
|
||||||
|
|
||||||
if use_internode:
|
torch_output = torch_experts(
|
||||||
nvshmem_finalize()
|
a,
|
||||||
|
w1,
|
||||||
|
w2,
|
||||||
|
topk_weight,
|
||||||
|
topk_ids,
|
||||||
|
w1_scale=w1_s,
|
||||||
|
w2_scale=w2_s,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
|
quant_dtype=quant_dtype,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
block_shape=block_shape,
|
||||||
|
)
|
||||||
|
|
||||||
|
batched_output = naive_batched_moe(
|
||||||
|
a,
|
||||||
|
w1,
|
||||||
|
w2,
|
||||||
|
topk_weight,
|
||||||
|
topk_ids,
|
||||||
|
w1_scale=w1_s,
|
||||||
|
w2_scale=w2_s,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
|
quant_dtype=quant_dtype,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
block_shape=block_shape,
|
||||||
|
)
|
||||||
|
|
||||||
|
pplx_output = pplx_moe(
|
||||||
|
group_name,
|
||||||
|
rank,
|
||||||
|
world_size,
|
||||||
|
dp_size,
|
||||||
|
a,
|
||||||
|
w1,
|
||||||
|
w2,
|
||||||
|
topk_weight,
|
||||||
|
topk_ids,
|
||||||
|
w1_scale=w1_s,
|
||||||
|
w2_scale=w2_s,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
|
quant_dtype=quant_dtype,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
block_shape=block_shape,
|
||||||
|
)
|
||||||
|
|
||||||
|
chunked_batch_output = chunk_by_rank(
|
||||||
|
batched_output, pgi.rank, pgi.world_size).to(pplx_output.device)
|
||||||
|
|
||||||
|
torch.testing.assert_close(batched_output,
|
||||||
|
torch_output,
|
||||||
|
atol=3e-2,
|
||||||
|
rtol=3e-2)
|
||||||
|
|
||||||
|
torch.testing.assert_close(pplx_output,
|
||||||
|
chunked_batch_output,
|
||||||
|
atol=3e-2,
|
||||||
|
rtol=3e-2)
|
||||||
|
finally:
|
||||||
|
if use_internode:
|
||||||
|
nvshmem_finalize()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS)
|
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
|
||||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||||
@pytest.mark.parametrize("topk", TOP_KS)
|
@pytest.mark.parametrize("topk", TOP_KS)
|
||||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
@pytest.mark.parametrize("dtype", DTYPES)
|
||||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||||
@pytest.mark.parametrize("use_internode", [False])
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
|
@pytest.mark.optional
|
||||||
@requires_pplx
|
@requires_pplx
|
||||||
def test_pplx_moe(
|
def test_pplx_moe_slow(
|
||||||
mnk: tuple[int, int, int],
|
mnk: tuple[int, int, int],
|
||||||
e: int,
|
e: int,
|
||||||
topk: int,
|
topk: int,
|
||||||
@@ -633,18 +752,143 @@ def test_pplx_moe(
|
|||||||
use_fp8_w8a8 = False
|
use_fp8_w8a8 = False
|
||||||
quant_dtype = None
|
quant_dtype = None
|
||||||
|
|
||||||
if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
|
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
|
||||||
pytest.skip("Skip quantization test for non-quantized type")
|
pytest.skip("Skip quantization test for non-quantized type")
|
||||||
|
|
||||||
|
if per_act_token_quant and block_shape is not None:
|
||||||
|
pytest.skip("Skip illegal quantization combination")
|
||||||
|
|
||||||
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||||
|
|
||||||
_, w1, w1_s, _, w2, w2_s = make_test_weights(e,
|
_, w1, w1_s, _, w2, w2_s = make_test_weights(
|
||||||
n,
|
e,
|
||||||
k,
|
n,
|
||||||
quant_dtype=quant_dtype,
|
k,
|
||||||
block_shape=block_shape)
|
quant_dtype=quant_dtype,
|
||||||
|
block_shape=block_shape,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
)
|
||||||
|
|
||||||
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk,
|
parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e,
|
||||||
w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
|
w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
|
||||||
use_internode)
|
use_internode)
|
||||||
|
|
||||||
|
|
||||||
|
def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
|
||||||
|
make_weights: bool, test_fn: Callable):
|
||||||
|
|
||||||
|
def format_result(msg, ex=None):
|
||||||
|
if ex is not None:
|
||||||
|
x = str(ex)
|
||||||
|
newx = x.strip(" \n\t")[:16]
|
||||||
|
if len(newx) < len(x):
|
||||||
|
newx = newx + " ..."
|
||||||
|
|
||||||
|
prefix = "E\t"
|
||||||
|
print(f"{textwrap.indent(traceback.format_exc(), prefix)}")
|
||||||
|
print(f"FAILED {msg} - {newx}\n")
|
||||||
|
else:
|
||||||
|
print(f"PASSED {msg}")
|
||||||
|
|
||||||
|
current_platform.seed_everything(7)
|
||||||
|
combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES,
|
||||||
|
[False, True], [None, [128, 128]])
|
||||||
|
exceptions = []
|
||||||
|
count = 0
|
||||||
|
for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
|
||||||
|
count = count + 1
|
||||||
|
m, n, k = mnk
|
||||||
|
|
||||||
|
if dtype == torch.float8_e4m3fn:
|
||||||
|
use_fp8_w8a8 = True
|
||||||
|
quant_dtype = dtype
|
||||||
|
else:
|
||||||
|
use_fp8_w8a8 = False
|
||||||
|
quant_dtype = None
|
||||||
|
|
||||||
|
test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
|
||||||
|
f"dtype={dtype}, per_act_token={per_act_token_quant}, "
|
||||||
|
f"block_shape={block_shape}")
|
||||||
|
|
||||||
|
if not use_fp8_w8a8 and (per_act_token_quant
|
||||||
|
or block_shape is not None):
|
||||||
|
print(
|
||||||
|
f"{test_desc} - Skip quantization test for non-quantized type."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if per_act_token_quant and block_shape is not None:
|
||||||
|
print(f"{test_desc} - Skip illegal quantization combination.")
|
||||||
|
continue
|
||||||
|
|
||||||
|
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||||
|
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||||
|
|
||||||
|
args = dict()
|
||||||
|
if make_weights:
|
||||||
|
_, w1, w1_s, _, w2, w2_s = make_test_weights(
|
||||||
|
e,
|
||||||
|
n,
|
||||||
|
k,
|
||||||
|
quant_dtype=quant_dtype,
|
||||||
|
block_shape=block_shape,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
)
|
||||||
|
args["w1"] = w1
|
||||||
|
args["w2"] = w2
|
||||||
|
args["w1_s"] = w1_s
|
||||||
|
args["w2_s"] = w2_s
|
||||||
|
|
||||||
|
try:
|
||||||
|
test_fn(
|
||||||
|
pgi=pgi,
|
||||||
|
dp_size=dp_size,
|
||||||
|
a=a,
|
||||||
|
score=score,
|
||||||
|
topk=topk,
|
||||||
|
num_experts=e,
|
||||||
|
quant_dtype=quant_dtype,
|
||||||
|
per_act_token_quant=per_act_token_quant,
|
||||||
|
block_shape=block_shape,
|
||||||
|
use_internode=use_internode,
|
||||||
|
**args,
|
||||||
|
)
|
||||||
|
format_result(test_desc)
|
||||||
|
except Exception as ex:
|
||||||
|
format_result(test_desc, ex)
|
||||||
|
exceptions.append(ex)
|
||||||
|
|
||||||
|
if len(exceptions) > 0:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"{len(exceptions)} of {count} tests failed in child process, "
|
||||||
|
f"rank={pgi.rank}.")
|
||||||
|
else:
|
||||||
|
print(f"{count} of {count} tests passed in child process, "
|
||||||
|
f"rank={pgi.rank}.")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||||
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
|
@requires_pplx
|
||||||
|
def test_pplx_prepare_finalize(
|
||||||
|
world_dp_size: tuple[int, int],
|
||||||
|
use_internode: bool,
|
||||||
|
):
|
||||||
|
current_platform.seed_everything(7)
|
||||||
|
world_size, dp_size = world_dp_size
|
||||||
|
parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
|
||||||
|
use_internode, False, _pplx_prepare_finalize)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||||
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
|
@requires_pplx
|
||||||
|
def test_pplx_moe(
|
||||||
|
world_dp_size: tuple[int, int],
|
||||||
|
use_internode: bool,
|
||||||
|
):
|
||||||
|
current_platform.seed_everything(7)
|
||||||
|
world_size, dp_size = world_dp_size
|
||||||
|
parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True,
|
||||||
|
_pplx_moe)
|
||||||
|
|||||||
@@ -63,13 +63,12 @@ def batched_moe(
|
|||||||
|
|
||||||
fused_experts = FusedMoEModularKernel(
|
fused_experts = FusedMoEModularKernel(
|
||||||
BatchedPrepareAndFinalize(max_num_tokens,
|
BatchedPrepareAndFinalize(max_num_tokens,
|
||||||
world_size=1,
|
num_dispatchers=1,
|
||||||
dp_size=1,
|
num_local_experts=w1.shape[0],
|
||||||
rank=0),
|
rank=0),
|
||||||
BatchedTritonExperts(
|
BatchedTritonExperts(
|
||||||
max_num_tokens=max_num_tokens,
|
max_num_tokens=max_num_tokens,
|
||||||
world_size=1,
|
num_dispatchers=1,
|
||||||
dp_size=1,
|
|
||||||
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||||
per_act_token_quant=per_act_token_quant,
|
per_act_token_quant=per_act_token_quant,
|
||||||
block_shape=block_shape,
|
block_shape=block_shape,
|
||||||
@@ -105,13 +104,12 @@ def naive_batched_moe(
|
|||||||
|
|
||||||
fused_experts = FusedMoEModularKernel(
|
fused_experts = FusedMoEModularKernel(
|
||||||
BatchedPrepareAndFinalize(max_num_tokens,
|
BatchedPrepareAndFinalize(max_num_tokens,
|
||||||
world_size=1,
|
num_dispatchers=1,
|
||||||
dp_size=1,
|
num_local_experts=w1.shape[0],
|
||||||
rank=0),
|
rank=0),
|
||||||
NaiveBatchedExperts(
|
NaiveBatchedExperts(
|
||||||
max_num_tokens=max_num_tokens,
|
max_num_tokens=max_num_tokens,
|
||||||
dp_size=1,
|
num_dispatchers=1,
|
||||||
world_size=1,
|
|
||||||
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn,
|
||||||
per_act_token_quant=per_act_token_quant,
|
per_act_token_quant=per_act_token_quant,
|
||||||
block_shape=block_shape,
|
block_shape=block_shape,
|
||||||
|
|||||||
@@ -277,6 +277,24 @@ def dequant(
|
|||||||
return t.to(out_dtype)
|
return t.to(out_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def batched_dequant(
|
||||||
|
t: torch.Tensor,
|
||||||
|
scale: Optional[torch.Tensor],
|
||||||
|
block_shape: Optional[list[int]],
|
||||||
|
per_act_token_quant: bool,
|
||||||
|
out_dtype: Optional[torch.dtype] = torch.float32,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
if scale is not None:
|
||||||
|
assert t.shape[0] == scale.shape[0]
|
||||||
|
out = torch.empty_like(t, dtype=out_dtype)
|
||||||
|
for e in range(t.shape[0]):
|
||||||
|
out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant,
|
||||||
|
out_dtype)
|
||||||
|
return out
|
||||||
|
|
||||||
|
return t.to(out_dtype)
|
||||||
|
|
||||||
|
|
||||||
def native_batched_masked_quant_matmul(
|
def native_batched_masked_quant_matmul(
|
||||||
A: torch.Tensor,
|
A: torch.Tensor,
|
||||||
B: torch.Tensor,
|
B: torch.Tensor,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -74,3 +75,51 @@ def test_apply_repetition_penalties(
|
|||||||
# Test the operator by applying the opcheck utility
|
# Test the operator by applying the opcheck utility
|
||||||
opcheck(torch.ops._C.apply_repetition_penalties_,
|
opcheck(torch.ops._C.apply_repetition_penalties_,
|
||||||
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
|
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not current_platform.is_cuda(),
|
||||||
|
reason="This test for checking CUDA kernel")
|
||||||
|
@torch.inference_mode()
|
||||||
|
def test_apply_repetition_penalties_zero_seqs() -> None:
|
||||||
|
"""
|
||||||
|
Test the apply_repetition_penalties custom op with num_seqs=0
|
||||||
|
against a reference implementation.
|
||||||
|
"""
|
||||||
|
num_seqs = 0
|
||||||
|
vocab_size = 17
|
||||||
|
repetition_penalty = 1.05
|
||||||
|
dtype = torch.float32
|
||||||
|
seed = 0
|
||||||
|
|
||||||
|
current_platform.seed_everything(seed)
|
||||||
|
torch.set_default_device("cuda:0")
|
||||||
|
|
||||||
|
# Create test data
|
||||||
|
logits = torch.randn(num_seqs, vocab_size, dtype=dtype)
|
||||||
|
|
||||||
|
# Create masks with some random tokens marked as repeated
|
||||||
|
prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
|
||||||
|
output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
|
||||||
|
|
||||||
|
# No tokens to mark as repeated since num_seqs=0
|
||||||
|
|
||||||
|
# Create repetition penalties tensor
|
||||||
|
repetition_penalties = torch.full((num_seqs, ),
|
||||||
|
repetition_penalty,
|
||||||
|
dtype=dtype)
|
||||||
|
|
||||||
|
# Run all three implementations
|
||||||
|
logits_torch = logits.clone()
|
||||||
|
logits_cuda = logits.clone()
|
||||||
|
|
||||||
|
apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
|
||||||
|
repetition_penalties)
|
||||||
|
apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
|
||||||
|
repetition_penalties)
|
||||||
|
|
||||||
|
# Compare all outputs to reference
|
||||||
|
torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
|
||||||
|
|
||||||
|
# Test the operator by applying the opcheck utility
|
||||||
|
opcheck(torch.ops._C.apply_repetition_penalties_,
|
||||||
|
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""Integration tests for FlexAttention backend vs default backend"""
|
"""Integration tests for FlexAttention backend vs default backend"""
|
||||||
|
|
||||||
import random
|
import random
|
||||||
|
|||||||
@@ -1094,6 +1094,8 @@ def torch_experts(
|
|||||||
if expert_map is not None:
|
if expert_map is not None:
|
||||||
topk_ids = expert_map[topk_ids]
|
topk_ids = expert_map[topk_ids]
|
||||||
|
|
||||||
|
f32 = torch.float32
|
||||||
|
|
||||||
for i in range(num_experts):
|
for i in range(num_experts):
|
||||||
mask = topk_ids == i
|
mask = topk_ids == i
|
||||||
if mask.sum():
|
if mask.sum():
|
||||||
@@ -1109,7 +1111,8 @@ def torch_experts(
|
|||||||
out.dtype)
|
out.dtype)
|
||||||
tmp2 = SiluAndMul()(tmp1)
|
tmp2 = SiluAndMul()(tmp1)
|
||||||
tmp2, b_scale = moe_kernel_quantize_input(
|
tmp2, b_scale = moe_kernel_quantize_input(
|
||||||
tmp2, None, quant_dtype, per_act_token_quant, block_shape)
|
tmp2, a2_scale, quant_dtype, per_act_token_quant,
|
||||||
|
block_shape)
|
||||||
|
|
||||||
out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
|
out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
|
||||||
w2_scale[i], block_shape,
|
w2_scale[i], block_shape,
|
||||||
@@ -1117,7 +1120,6 @@ def torch_experts(
|
|||||||
else:
|
else:
|
||||||
assert (a_scale is not None and w1_scale is not None
|
assert (a_scale is not None and w1_scale is not None
|
||||||
and w2_scale is not None)
|
and w2_scale is not None)
|
||||||
f32 = torch.float32
|
|
||||||
scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
|
scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
|
||||||
tmp1 = a[mask].to(f32) * scales
|
tmp1 = a[mask].to(f32) * scales
|
||||||
w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)
|
w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)
|
||||||
@@ -1126,8 +1128,8 @@ def torch_experts(
|
|||||||
w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
|
w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
|
||||||
out[mask] = (tmp2 @ w2_dq).to(out.dtype)
|
out[mask] = (tmp2 @ w2_dq).to(out.dtype)
|
||||||
|
|
||||||
return (out.view(M, -1, w2.shape[1]) *
|
return (out.view(M, -1, w2.shape[1]).to(f32) *
|
||||||
topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
|
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
|
||||||
|
|
||||||
|
|
||||||
def torch_moe(a: torch.Tensor,
|
def torch_moe(a: torch.Tensor,
|
||||||
|
|||||||
@@ -249,23 +249,6 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
|
|||||||
model_runner.model)
|
model_runner.model)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=[True, False])
|
|
||||||
def run_with_both_engines_lora(request, monkeypatch):
|
|
||||||
# Automatically runs tests twice, once with V1 and once without
|
|
||||||
use_v1 = request.param
|
|
||||||
# Tests decorated with `@skip_v1` are only run without v1
|
|
||||||
skip_v1 = request.node.get_closest_marker("skip_v1")
|
|
||||||
|
|
||||||
if use_v1:
|
|
||||||
if skip_v1:
|
|
||||||
pytest.skip("Skipping test on vllm V1")
|
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '1')
|
|
||||||
else:
|
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
|
||||||
|
|
||||||
yield
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def reset_default_device():
|
def reset_default_device():
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||||
from tests.utils import multi_gpu_test
|
from tests.utils import multi_gpu_test
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
@@ -19,31 +20,55 @@ pytestmark = pytest.mark.hybrid_model
|
|||||||
SSM_MODELS = [
|
SSM_MODELS = [
|
||||||
"state-spaces/mamba-130m-hf",
|
"state-spaces/mamba-130m-hf",
|
||||||
"tiiuae/falcon-mamba-tiny-dev",
|
"tiiuae/falcon-mamba-tiny-dev",
|
||||||
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
|
|
||||||
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
|
|
||||||
# doesn't compare vLLM output with HF output.
|
|
||||||
# See https://github.com/huggingface/transformers/pull/35943
|
|
||||||
"mistralai/Mamba-Codestral-7B-v0.1",
|
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
HYBRID_MODELS = [
|
HYBRID_MODELS = [
|
||||||
"ai21labs/Jamba-tiny-dev",
|
"ai21labs/Jamba-tiny-dev",
|
||||||
# NOTE: Currently the test failes due to HF transformers issue fixed in:
|
|
||||||
# https://github.com/huggingface/transformers/pull/39033
|
|
||||||
# We will enable vLLM test for Granite after next HF transformers release.
|
|
||||||
# "ibm-granite/granite-4.0-tiny-preview",
|
|
||||||
# NOTE: Running Plamo2 in transformers implementation requires to install
|
# NOTE: Running Plamo2 in transformers implementation requires to install
|
||||||
# causal-conv1d package, which is not listed as a test dependency as it's
|
# causal-conv1d package, which is not listed as a test dependency as it's
|
||||||
# not compatible with pip-compile.
|
# not compatible with pip-compile.
|
||||||
"pfnet/plamo-2-1b",
|
"pfnet/plamo-2-1b",
|
||||||
"Zyphra/Zamba2-1.2B-instruct",
|
"Zyphra/Zamba2-1.2B-instruct",
|
||||||
"hmellor/tiny-random-BambaForCausalLM",
|
"hmellor/tiny-random-BambaForCausalLM",
|
||||||
|
"ibm-ai-platform/Bamba-9B-v1",
|
||||||
|
"nvidia/Nemotron-H-8B-Base-8K",
|
||||||
|
"ibm-granite/granite-4.0-tiny-preview",
|
||||||
|
"tiiuae/Falcon-H1-0.5B-Base",
|
||||||
|
]
|
||||||
|
|
||||||
|
HF_UNSUPPORTED_MODELS = [
|
||||||
|
# The HF transformers implementation of
|
||||||
|
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
|
||||||
|
# doesn't compare vLLM output with HF output.
|
||||||
|
# See https://github.com/huggingface/transformers/pull/35943
|
||||||
|
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||||
|
# Note: I'm not seeing the same output from vLLM V0 vs. HF transformers
|
||||||
|
# for Nemotron-H-8B; currently only compare vLLM V0 vs. vLLM V1
|
||||||
|
"nvidia/Nemotron-H-8B-Base-8K",
|
||||||
|
# NOTE: Currently the test fails due to HF transformers issue fixed in:
|
||||||
|
# https://github.com/huggingface/transformers/pull/39033
|
||||||
|
# We will enable vLLM test for Granite after next HF transformers release.
|
||||||
|
"ibm-granite/granite-4.0-tiny-preview",
|
||||||
]
|
]
|
||||||
|
|
||||||
V1_SUPPORTED_MODELS = [
|
V1_SUPPORTED_MODELS = [
|
||||||
"mistralai/Mamba-Codestral-7B-v0.1",
|
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||||
|
"ibm-ai-platform/Bamba-9B-v1",
|
||||||
|
"Zyphra/Zamba2-1.2B-instruct",
|
||||||
|
"nvidia/Nemotron-H-8B-Base-8K",
|
||||||
|
"ibm-granite/granite-4.0-tiny-preview",
|
||||||
|
"tiiuae/Falcon-H1-0.5B-Base",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
ATTN_BLOCK_SIZES = {
|
||||||
|
"ibm-ai-platform/Bamba-9B-v1": 528,
|
||||||
|
"Zyphra/Zamba2-1.2B-instruct": 80,
|
||||||
|
"nvidia/Nemotron-H-8B-Base-8K": 528,
|
||||||
|
"ibm-granite/granite-4.0-tiny-preview": 400,
|
||||||
|
"tiiuae/Falcon-H1-0.5B-Base": 800,
|
||||||
|
}
|
||||||
|
|
||||||
# Avoid OOM
|
# Avoid OOM
|
||||||
MAX_NUM_SEQS = 4
|
MAX_NUM_SEQS = 4
|
||||||
|
|
||||||
@@ -60,8 +85,16 @@ def test_models(
|
|||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
|
try:
|
||||||
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||||
|
model_info.check_available_online(on_fail="skip")
|
||||||
|
model_info.check_transformers_version(on_fail="skip")
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
with hf_runner(model) as hf_model:
|
with hf_runner(model) as hf_model:
|
||||||
if model != "mistralai/Mamba-Codestral-7B-v0.1":
|
if model not in HF_UNSUPPORTED_MODELS:
|
||||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||||
example_prompts, max_tokens, num_logprobs)
|
example_prompts, max_tokens, num_logprobs)
|
||||||
else:
|
else:
|
||||||
@@ -72,12 +105,21 @@ def test_models(
|
|||||||
example_prompts, max_tokens, num_logprobs)
|
example_prompts, max_tokens, num_logprobs)
|
||||||
|
|
||||||
if model in V1_SUPPORTED_MODELS:
|
if model in V1_SUPPORTED_MODELS:
|
||||||
|
if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
|
||||||
|
block_size = ATTN_BLOCK_SIZES[model]
|
||||||
|
else:
|
||||||
|
block_size = 16
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
|
if model in HYBRID_MODELS:
|
||||||
|
# required due to reorder_batch behaviour
|
||||||
|
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
|
||||||
with vllm_runner(model,
|
with vllm_runner(model,
|
||||||
max_num_seqs=MAX_NUM_SEQS,
|
max_num_seqs=MAX_NUM_SEQS,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
enable_prefix_caching=False) as vllm_model:
|
enable_prefix_caching=False,
|
||||||
|
block_size=block_size) as vllm_model:
|
||||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||||
example_prompts, max_tokens, num_logprobs)
|
example_prompts, max_tokens, num_logprobs)
|
||||||
else:
|
else:
|
||||||
@@ -111,6 +153,14 @@ def test_batching(
|
|||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
|
try:
|
||||||
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||||
|
model_info.check_available_online(on_fail="skip")
|
||||||
|
model_info.check_transformers_version(on_fail="skip")
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
for_loop_outputs = []
|
for_loop_outputs = []
|
||||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||||
for prompt in example_prompts:
|
for prompt in example_prompts:
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import os
|
import os
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@@ -74,6 +75,13 @@ def test_models(
|
|||||||
vllm_extra_kwargs["override_pooler_config"] = \
|
vllm_extra_kwargs["override_pooler_config"] = \
|
||||||
PoolerConfig(pooling_type="MEAN", normalize=False)
|
PoolerConfig(pooling_type="MEAN", normalize=False)
|
||||||
|
|
||||||
|
max_model_len: Optional[int] = 512
|
||||||
|
if model in [
|
||||||
|
"sentence-transformers/all-MiniLM-L12-v2",
|
||||||
|
"sentence-transformers/stsb-roberta-base-v2"
|
||||||
|
]:
|
||||||
|
max_model_len = None
|
||||||
|
|
||||||
# The example_prompts has ending "\n", for example:
|
# The example_prompts has ending "\n", for example:
|
||||||
# "Write a short story about a robot that dreams for the first time.\n"
|
# "Write a short story about a robot that dreams for the first time.\n"
|
||||||
# sentence_transformers will strip the input texts, see:
|
# sentence_transformers will strip the input texts, see:
|
||||||
@@ -87,7 +95,7 @@ def test_models(
|
|||||||
|
|
||||||
with vllm_runner(model,
|
with vllm_runner(model,
|
||||||
task="embed",
|
task="embed",
|
||||||
max_model_len=512,
|
max_model_len=max_model_len,
|
||||||
**vllm_extra_kwargs) as vllm_model:
|
**vllm_extra_kwargs) as vllm_model:
|
||||||
vllm_outputs = vllm_model.embed(example_prompts)
|
vllm_outputs = vllm_model.embed(example_prompts)
|
||||||
|
|
||||||
|
|||||||
@@ -56,10 +56,16 @@ MODELS = [
|
|||||||
enable_test=False),
|
enable_test=False),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
V1FlashAttentionImpNotSupported = [
|
||||||
|
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_info", MODELS)
|
@pytest.mark.parametrize("model_info", MODELS)
|
||||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
|
||||||
model_info: EmbedModelInfo) -> None:
|
monkeypatch) -> None:
|
||||||
|
if model_info.name in V1FlashAttentionImpNotSupported:
|
||||||
|
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||||
|
|
||||||
vllm_extra_kwargs: dict[str, Any] = {}
|
vllm_extra_kwargs: dict[str, Any] = {}
|
||||||
if model_info.architecture == "GteNewModel":
|
if model_info.architecture == "GteNewModel":
|
||||||
@@ -71,8 +77,10 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
|
|||||||
|
|
||||||
@pytest.mark.parametrize("model_info", MODELS)
|
@pytest.mark.parametrize("model_info", MODELS)
|
||||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||||
model_info: EmbedModelInfo,
|
model_info: EmbedModelInfo, example_prompts,
|
||||||
example_prompts) -> None:
|
monkeypatch) -> None:
|
||||||
|
if model_info.name in V1FlashAttentionImpNotSupported:
|
||||||
|
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||||
|
|
||||||
vllm_extra_kwargs: dict[str, Any] = {}
|
vllm_extra_kwargs: dict[str, Any] = {}
|
||||||
if model_info.architecture == "GteNewModel":
|
if model_info.architecture == "GteNewModel":
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from ...utils import EmbedModelInfo
|
from ...utils import EmbedModelInfo
|
||||||
|
|||||||
84
tests/models/language/pooling/test_mxbai_rerank.py
Normal file
84
tests/models/language/pooling/test_mxbai_rerank.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from tests.conftest import HfRunner
|
||||||
|
|
||||||
|
from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
|
||||||
|
|
||||||
|
RERANK_MODELS = [
|
||||||
|
RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
|
||||||
|
architecture="Qwen2ForSequenceClassification",
|
||||||
|
dtype="float32",
|
||||||
|
enable_test=True),
|
||||||
|
RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
|
||||||
|
architecture="Qwen2ForSequenceClassification",
|
||||||
|
dtype="float32",
|
||||||
|
enable_test=False)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class MxbaiRerankerHfRunner(HfRunner):
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
model_name: str,
|
||||||
|
dtype: str = "auto",
|
||||||
|
*args: Any,
|
||||||
|
**kwargs: Any) -> None:
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
|
||||||
|
|
||||||
|
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||||
|
padding_side='left')
|
||||||
|
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
|
||||||
|
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
|
||||||
|
|
||||||
|
def predict(self, prompts: list[list[str]], *args,
|
||||||
|
**kwargs) -> torch.Tensor:
|
||||||
|
|
||||||
|
def process_inputs(pairs):
|
||||||
|
inputs = self.tokenizer(pairs,
|
||||||
|
padding=False,
|
||||||
|
truncation='longest_first',
|
||||||
|
return_attention_mask=False)
|
||||||
|
for i, ele in enumerate(inputs['input_ids']):
|
||||||
|
inputs['input_ids'][i] = ele
|
||||||
|
inputs = self.tokenizer.pad(inputs,
|
||||||
|
padding=True,
|
||||||
|
return_tensors="pt")
|
||||||
|
for key in inputs:
|
||||||
|
inputs[key] = inputs[key].to(self.model.device)
|
||||||
|
return inputs
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def compute_logits(inputs):
|
||||||
|
logits = self.model(**inputs).logits[:, -1, :]
|
||||||
|
yes_logits = logits[:, self.yes_loc]
|
||||||
|
no_logits = logits[:, self.no_loc]
|
||||||
|
logits = yes_logits - no_logits
|
||||||
|
scores = logits.float().sigmoid()
|
||||||
|
return scores
|
||||||
|
|
||||||
|
scores = []
|
||||||
|
for prompt in prompts:
|
||||||
|
inputs = process_inputs([prompt])
|
||||||
|
score = compute_logits(inputs)
|
||||||
|
scores.append(score[0].item())
|
||||||
|
return torch.Tensor(scores)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||||
|
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||||
|
vllm_extra_kwargs: dict[str, Any] = {}
|
||||||
|
if model_info.architecture == "Qwen2ForSequenceClassification":
|
||||||
|
vllm_extra_kwargs["hf_overrides"] = {
|
||||||
|
"architectures": ["Qwen2ForSequenceClassification"],
|
||||||
|
"classifier_from_token": ["0", "1"],
|
||||||
|
"method": "from_2_way_softmax",
|
||||||
|
}
|
||||||
|
|
||||||
|
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
|
||||||
|
vllm_extra_kwargs)
|
||||||
@@ -33,9 +33,6 @@ if current_platform.is_rocm():
|
|||||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||||
|
|
||||||
REQUIRES_V0_MODELS = [
|
REQUIRES_V0_MODELS = [
|
||||||
# V1 Test: no way to fall back for head_dim = 80
|
|
||||||
# https://github.com/vllm-project/vllm/issues/14524
|
|
||||||
"qwen_vl",
|
|
||||||
# V1 Test: not enough KV cache space in C1.
|
# V1 Test: not enough KV cache space in C1.
|
||||||
"fuyu",
|
"fuyu",
|
||||||
]
|
]
|
||||||
@@ -221,8 +218,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
marks=[large_gpu_mark(min_gb=32)],
|
marks=[large_gpu_mark(min_gb=32)],
|
||||||
),
|
),
|
||||||
"blip2": VLMTestInfo(
|
"blip2": VLMTestInfo(
|
||||||
# TODO: Change back to 2.7b once head_dim = 80 is supported
|
models=["Salesforce/blip2-opt-2.7b"],
|
||||||
models=["Salesforce/blip2-opt-6.7b"],
|
|
||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
||||||
img_idx_to_prompt=lambda idx: "",
|
img_idx_to_prompt=lambda idx: "",
|
||||||
@@ -340,8 +336,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
"h2ovl": VLMTestInfo(
|
"h2ovl": VLMTestInfo(
|
||||||
models = [
|
models = [
|
||||||
"h2oai/h2ovl-mississippi-800m",
|
"h2oai/h2ovl-mississippi-800m",
|
||||||
# TODO: Re-enable once head_dim = 80 is supported
|
"h2oai/h2ovl-mississippi-2b",
|
||||||
# "h2oai/h2ovl-mississippi-2b",
|
|
||||||
],
|
],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ MODELS = [
|
|||||||
QWEN2_CONFIG,
|
QWEN2_CONFIG,
|
||||||
PHI3_CONFIG,
|
PHI3_CONFIG,
|
||||||
GPT2_CONFIG,
|
GPT2_CONFIG,
|
||||||
# STABLELM_CONFIG, # enable this when v1 support head_size=80
|
STABLELM_CONFIG,
|
||||||
DOLPHIN_CONFIG,
|
DOLPHIN_CONFIG,
|
||||||
# STARCODER_CONFIG, # broken
|
# STARCODER_CONFIG, # broken
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -169,7 +169,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
|||||||
"ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501
|
"ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501
|
||||||
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501
|
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501
|
||||||
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
|
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
|
||||||
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
|
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base",
|
||||||
min_transformers_version="4.53"),
|
min_transformers_version="4.53"),
|
||||||
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
|
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
|
||||||
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
|
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
|
||||||
@@ -240,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
|||||||
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
|
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
|
||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
|
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
|
||||||
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
|
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
|
||||||
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
|
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
|
||||||
|
# Blocksparse attention not supported in V1 yet
|
||||||
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
|
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
v0_only=True),
|
v0_only=True),
|
||||||
@@ -258,10 +259,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
|||||||
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
|
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
|
||||||
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
|
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
|
||||||
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
|
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
|
||||||
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
|
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501
|
||||||
v0_only=True),
|
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
|
||||||
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
|
|
||||||
v0_only=True),
|
|
||||||
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
|
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
|
||||||
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
|
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
|
||||||
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
|
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
|
||||||
@@ -330,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
|||||||
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
|
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
|
||||||
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
|
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
|
||||||
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
|
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
|
||||||
extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501
|
extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
|
||||||
v0_only=True),
|
|
||||||
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
|
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
|
||||||
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
|
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
|
||||||
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
|
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
|
||||||
@@ -359,8 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
|||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
|
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
|
||||||
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
|
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
|
||||||
trust_remote_code=True,
|
trust_remote_code=True),
|
||||||
v0_only=True),
|
|
||||||
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
||||||
max_model_len=10240),
|
max_model_len=10240),
|
||||||
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
|
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
|
||||||
|
|||||||
@@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
|
|||||||
model_info.check_transformers_version(on_fail="skip")
|
model_info.check_transformers_version(on_fail="skip")
|
||||||
|
|
||||||
# FIXME: Possible memory leak in the previous tests?
|
# FIXME: Possible memory leak in the previous tests?
|
||||||
if model_arch == "GraniteSpeechForConditionalGeneration":
|
if model_arch in ("GraniteSpeechForConditionalGeneration",
|
||||||
|
"KimiVLForConditionalGeneration"):
|
||||||
pytest.skip("Avoid OOM")
|
pytest.skip("Avoid OOM")
|
||||||
|
|
||||||
# Avoid OOM and reduce initialization time by only using 1 layer
|
# Avoid OOM and reduce initialization time by only using 1 layer
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# Copyright © 2025, Oracle and/or its affiliates.
|
# Copyright © 2025, Oracle and/or its affiliates.
|
||||||
"""Tests RTN quantization startup and generation,
|
"""Tests RTN quantization startup and generation,
|
||||||
doesn't test correctness
|
doesn't test correctness
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import torch
|
|||||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||||
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
|
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
|
||||||
from vllm.v1.core.sched.scheduler import Scheduler
|
from vllm.v1.core.sched.scheduler import Scheduler
|
||||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||||
@@ -17,6 +17,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
|||||||
from vllm.v1.outputs import ModelRunnerOutput
|
from vllm.v1.outputs import ModelRunnerOutput
|
||||||
from vllm.v1.request import Request, RequestStatus
|
from vllm.v1.request import Request, RequestStatus
|
||||||
from vllm.v1.structured_output import StructuredOutputManager
|
from vllm.v1.structured_output import StructuredOutputManager
|
||||||
|
from vllm.v1.structured_output.request import StructuredOutputRequest
|
||||||
|
|
||||||
EOS_TOKEN_ID = 50256
|
EOS_TOKEN_ID = 50256
|
||||||
|
|
||||||
@@ -33,6 +34,7 @@ def create_scheduler(
|
|||||||
block_size: int = 16,
|
block_size: int = 16,
|
||||||
max_model_len: Optional[int] = None,
|
max_model_len: Optional[int] = None,
|
||||||
num_speculative_tokens: Optional[int] = None,
|
num_speculative_tokens: Optional[int] = None,
|
||||||
|
skip_tokenizer_init: bool = False,
|
||||||
) -> Scheduler:
|
) -> Scheduler:
|
||||||
'''Create scheduler under test.
|
'''Create scheduler under test.
|
||||||
|
|
||||||
@@ -65,6 +67,7 @@ def create_scheduler(
|
|||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
dtype="float16",
|
dtype="float16",
|
||||||
seed=42,
|
seed=42,
|
||||||
|
skip_tokenizer_init=skip_tokenizer_init,
|
||||||
)
|
)
|
||||||
# Cache config, optionally force APC
|
# Cache config, optionally force APC
|
||||||
kwargs_cache = ({} if enable_prefix_caching is None else {
|
kwargs_cache = ({} if enable_prefix_caching is None else {
|
||||||
@@ -186,7 +189,7 @@ def test_get_num_unfinished_requests():
|
|||||||
])
|
])
|
||||||
def test_schedule(enable_prefix_caching: Optional[bool],
|
def test_schedule(enable_prefix_caching: Optional[bool],
|
||||||
prompt_logprobs: Optional[int]):
|
prompt_logprobs: Optional[int]):
|
||||||
'''Test scheduling.
|
'''Test scheduling.
|
||||||
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
|
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
|
||||||
'''
|
'''
|
||||||
scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching)
|
scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching)
|
||||||
@@ -1408,7 +1411,7 @@ def create_requests_with_priority(
|
|||||||
|
|
||||||
|
|
||||||
def test_priority_scheduling_basic_ordering():
|
def test_priority_scheduling_basic_ordering():
|
||||||
"""Test that requests are scheduled in priority order
|
"""Test that requests are scheduled in priority order
|
||||||
(lower value = higher priority)."""
|
(lower value = higher priority)."""
|
||||||
scheduler = create_scheduler_with_priority()
|
scheduler = create_scheduler_with_priority()
|
||||||
|
|
||||||
@@ -1437,7 +1440,7 @@ def test_priority_scheduling_basic_ordering():
|
|||||||
|
|
||||||
|
|
||||||
def test_priority_scheduling_arrival_time_tiebreaker():
|
def test_priority_scheduling_arrival_time_tiebreaker():
|
||||||
"""Test that arrival time is used
|
"""Test that arrival time is used
|
||||||
as tiebreaker when priorities are equal."""
|
as tiebreaker when priorities are equal."""
|
||||||
scheduler = create_scheduler_with_priority()
|
scheduler = create_scheduler_with_priority()
|
||||||
|
|
||||||
@@ -1495,7 +1498,7 @@ def test_priority_scheduling_mixed_priority_and_arrival():
|
|||||||
|
|
||||||
|
|
||||||
def test_priority_scheduling_preemption():
|
def test_priority_scheduling_preemption():
|
||||||
"""Test that priority scheduling preempts
|
"""Test that priority scheduling preempts
|
||||||
lower priority requests when memory is constrained."""
|
lower priority requests when memory is constrained."""
|
||||||
# Create scheduler with very limited memory to force preemption
|
# Create scheduler with very limited memory to force preemption
|
||||||
scheduler = create_scheduler_with_priority(
|
scheduler = create_scheduler_with_priority(
|
||||||
@@ -1576,7 +1579,7 @@ def test_priority_scheduling_preemption():
|
|||||||
|
|
||||||
|
|
||||||
def test_priority_scheduling_no_preemption_when_space_available():
|
def test_priority_scheduling_no_preemption_when_space_available():
|
||||||
"""Test that preemption doesn't happen
|
"""Test that preemption doesn't happen
|
||||||
when there's space for new requests."""
|
when there's space for new requests."""
|
||||||
scheduler = create_scheduler_with_priority(
|
scheduler = create_scheduler_with_priority(
|
||||||
max_num_seqs=3, # Allow 3 concurrent requests
|
max_num_seqs=3, # Allow 3 concurrent requests
|
||||||
@@ -1626,7 +1629,7 @@ def test_priority_scheduling_no_preemption_when_space_available():
|
|||||||
|
|
||||||
|
|
||||||
def test_priority_scheduling_preemption_victim_selection():
|
def test_priority_scheduling_preemption_victim_selection():
|
||||||
"""Test that the correct victim is selected for
|
"""Test that the correct victim is selected for
|
||||||
preemption based on priority and arrival time."""
|
preemption based on priority and arrival time."""
|
||||||
# This test verifies the priority-based victim selection logic
|
# This test verifies the priority-based victim selection logic
|
||||||
# by checking the waiting queue order after adding requests with different
|
# by checking the waiting queue order after adding requests with different
|
||||||
@@ -1743,7 +1746,7 @@ def test_priority_scheduling_waiting_queue_order():
|
|||||||
|
|
||||||
|
|
||||||
def test_priority_scheduling_fcfs_fallback():
|
def test_priority_scheduling_fcfs_fallback():
|
||||||
"""Test that FCFS behavior is maintained when all
|
"""Test that FCFS behavior is maintained when all
|
||||||
requests have same priority."""
|
requests have same priority."""
|
||||||
scheduler = create_scheduler_with_priority()
|
scheduler = create_scheduler_with_priority()
|
||||||
|
|
||||||
@@ -1811,7 +1814,7 @@ def test_priority_scheduling_with_limited_slots():
|
|||||||
|
|
||||||
|
|
||||||
def test_priority_scheduling_heap_property():
|
def test_priority_scheduling_heap_property():
|
||||||
"""Test that the waiting queue maintains heap
|
"""Test that the waiting queue maintains heap
|
||||||
property for priority scheduling."""
|
property for priority scheduling."""
|
||||||
scheduler = create_scheduler_with_priority(
|
scheduler = create_scheduler_with_priority(
|
||||||
max_num_seqs=1, # Only one request can run at a time
|
max_num_seqs=1, # Only one request can run at a time
|
||||||
@@ -1857,3 +1860,39 @@ def test_priority_scheduling_heap_property():
|
|||||||
# Verify requests were scheduled in priority order (lowest value first)
|
# Verify requests were scheduled in priority order (lowest value first)
|
||||||
expected_priorities = sorted(priorities)
|
expected_priorities = sorted(priorities)
|
||||||
assert scheduled_priorities == expected_priorities
|
assert scheduled_priorities == expected_priorities
|
||||||
|
|
||||||
|
|
||||||
|
def test_schedule_skip_tokenizer_init():
|
||||||
|
scheduler = create_scheduler(skip_tokenizer_init=True)
|
||||||
|
requests = create_requests(num_requests=5)
|
||||||
|
for request in requests:
|
||||||
|
scheduler.add_request(request)
|
||||||
|
output = scheduler.schedule()
|
||||||
|
assert len(output.scheduled_new_reqs) == len(requests)
|
||||||
|
assert output.grammar_bitmask is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_schedule_skip_tokenizer_init_structured_output_request():
|
||||||
|
scheduler = create_scheduler(skip_tokenizer_init=True)
|
||||||
|
guided_params = GuidedDecodingParams(regex="[0-9]+")
|
||||||
|
sampling_params = SamplingParams(
|
||||||
|
ignore_eos=False,
|
||||||
|
max_tokens=16,
|
||||||
|
guided_decoding=guided_params,
|
||||||
|
)
|
||||||
|
request = Request(
|
||||||
|
request_id="0",
|
||||||
|
prompt_token_ids=[0, 1],
|
||||||
|
multi_modal_inputs=None,
|
||||||
|
multi_modal_hashes=None,
|
||||||
|
multi_modal_placeholders=None,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
pooling_params=None,
|
||||||
|
eos_token_id=EOS_TOKEN_ID,
|
||||||
|
structured_output_request=StructuredOutputRequest(sampling_params),
|
||||||
|
)
|
||||||
|
scheduler.add_request(request)
|
||||||
|
output = scheduler.schedule()
|
||||||
|
assert len(output.scheduled_new_reqs) == 0
|
||||||
|
assert len(scheduler.running) == 0
|
||||||
|
assert len(scheduler.waiting) == 1
|
||||||
|
|||||||
@@ -1,19 +1,30 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import random
|
import random
|
||||||
from typing import Optional
|
from typing import TYPE_CHECKING, Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM
|
||||||
|
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||||
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
|
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from tests.conftest import VllmRunner
|
||||||
|
|
||||||
MODEL = "facebook/opt-125m"
|
MODEL = "facebook/opt-125m"
|
||||||
DTYPE = "half"
|
DTYPE = "half"
|
||||||
|
|
||||||
|
|
||||||
def _vllm_model(apc: bool, vllm_runner, monkeypatch):
|
def _vllm_model(
|
||||||
|
apc: bool,
|
||||||
|
vllm_runner: type[VllmRunner],
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
*,
|
||||||
|
skip_tokenizer_init: bool = False,
|
||||||
|
):
|
||||||
"""Set up VllmRunner instance."""
|
"""Set up VllmRunner instance."""
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||||
return vllm_runner(
|
return vllm_runner(
|
||||||
@@ -23,6 +34,7 @@ def _vllm_model(apc: bool, vllm_runner, monkeypatch):
|
|||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
enable_prefix_caching=apc,
|
enable_prefix_caching=apc,
|
||||||
gpu_memory_utilization=0.5,
|
gpu_memory_utilization=0.5,
|
||||||
|
skip_tokenizer_init=skip_tokenizer_init,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -45,9 +57,27 @@ def vllm_model_apc(vllm_runner, monkeypatch):
|
|||||||
yield vllm_model
|
yield vllm_model
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(
|
||||||
|
# Function scope decouples tests & allows
|
||||||
|
# env var adjustment via monkeypatch
|
||||||
|
scope="function",
|
||||||
|
# Prefix caching
|
||||||
|
params=[False, True])
|
||||||
|
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
|
||||||
|
"""VllmRunner test fixture with APC."""
|
||||||
|
with _vllm_model(
|
||||||
|
request.param,
|
||||||
|
vllm_runner,
|
||||||
|
monkeypatch,
|
||||||
|
skip_tokenizer_init=True,
|
||||||
|
) as vllm_model:
|
||||||
|
yield vllm_model
|
||||||
|
|
||||||
|
|
||||||
def _get_test_sampling_params(
|
def _get_test_sampling_params(
|
||||||
prompt_list: list[str],
|
prompt_list: list[str],
|
||||||
seed: Optional[int] = 42,
|
seed: Optional[int] = 42,
|
||||||
|
structured_outputs: bool = False,
|
||||||
) -> tuple[list[SamplingParams], list[int]]:
|
) -> tuple[list[SamplingParams], list[int]]:
|
||||||
"""Generate random sampling params for a batch."""
|
"""Generate random sampling params for a batch."""
|
||||||
|
|
||||||
@@ -62,14 +92,34 @@ def _get_test_sampling_params(
|
|||||||
n_list = [get_mostly_n_gt1() for _ in range(len(prompt_list))]
|
n_list = [get_mostly_n_gt1() for _ in range(len(prompt_list))]
|
||||||
# High temperature to maximize the chance of unique completions
|
# High temperature to maximize the chance of unique completions
|
||||||
return [
|
return [
|
||||||
SamplingParams(temperature=0.95, top_p=0.95, n=n, seed=seed)
|
SamplingParams(
|
||||||
for n in n_list
|
temperature=0.95,
|
||||||
|
top_p=0.95,
|
||||||
|
n=n,
|
||||||
|
seed=seed,
|
||||||
|
guided_decoding=GuidedDecodingParams(
|
||||||
|
regex="[0-9]+") if structured_outputs else None,
|
||||||
|
) for n in n_list
|
||||||
], n_list
|
], n_list
|
||||||
|
|
||||||
|
|
||||||
|
def test_compatibility_with_skip_tokenizer_init(
|
||||||
|
vllm_model_skip_tokenizer_init: VllmRunner,
|
||||||
|
example_prompts: list[str],
|
||||||
|
):
|
||||||
|
# Case 1: Structured output request should raise an error.
|
||||||
|
sampling_params_list, _ = _get_test_sampling_params(
|
||||||
|
example_prompts,
|
||||||
|
structured_outputs=True,
|
||||||
|
)
|
||||||
|
model: LLM = vllm_model_skip_tokenizer_init.model
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
_ = model.generate(example_prompts, sampling_params_list)
|
||||||
|
|
||||||
|
|
||||||
def test_parallel_sampling(vllm_model, example_prompts) -> None:
|
def test_parallel_sampling(vllm_model, example_prompts) -> None:
|
||||||
"""Test passes if parallel sampling `n>1` yields `n` unique completions.
|
"""Test passes if parallel sampling `n>1` yields `n` unique completions.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vllm_model: VllmRunner instance under test.
|
vllm_model: VllmRunner instance under test.
|
||||||
example_prompt: test fixture providing prompts for testing.
|
example_prompt: test fixture providing prompts for testing.
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import random
|
import random
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ UNSUPPORTED_MODELS_V1 = [
|
|||||||
"openai/whisper-large-v3", # transcription
|
"openai/whisper-large-v3", # transcription
|
||||||
"facebook/bart-large-cnn", # encoder decoder
|
"facebook/bart-large-cnn", # encoder decoder
|
||||||
"state-spaces/mamba-130m-hf", # mamba1
|
"state-spaces/mamba-130m-hf", # mamba1
|
||||||
"hmellor/tiny-random-BambaForCausalLM", # hybrid
|
|
||||||
"BAAI/bge-m3", # embedding
|
"BAAI/bge-m3", # embedding
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from vllm.v1.request import RequestStatus
|
from vllm.v1.request import RequestStatus
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import gc
|
import gc
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
@@ -450,6 +450,7 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
|
|||||||
|
|
||||||
|
|
||||||
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
|
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
|
||||||
|
torch.set_default_dtype(torch.float16)
|
||||||
layer_0 = "model.layers.0.self_attn.attn"
|
layer_0 = "model.layers.0.self_attn.attn"
|
||||||
layer_1 = "model.layers.1.self_attn.attn"
|
layer_1 = "model.layers.1.self_attn.attn"
|
||||||
error_msg = f"{layer_1} must come before the current layer"
|
error_msg = f"{layer_1} must come before the current layer"
|
||||||
@@ -478,6 +479,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
|
|||||||
|
|
||||||
|
|
||||||
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
|
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
|
||||||
|
torch.set_default_dtype(torch.float16)
|
||||||
layer_0 = "model.layers.0.self_attn.attn"
|
layer_0 = "model.layers.0.self_attn.attn"
|
||||||
layer_1 = "model.layers.1.self_attn.attn"
|
layer_1 = "model.layers.1.self_attn.attn"
|
||||||
invalid_layer = "model.layers.0.cross_attn.attn"
|
invalid_layer = "model.layers.0.cross_attn.attn"
|
||||||
@@ -506,6 +508,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
|
|||||||
|
|
||||||
|
|
||||||
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
|
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
|
||||||
|
torch.set_default_dtype(torch.float16)
|
||||||
layer_0 = "model.layers.0.self_attn.attn"
|
layer_0 = "model.layers.0.self_attn.attn"
|
||||||
layer_1 = "model.layers.1.self_attn.attn"
|
layer_1 = "model.layers.1.self_attn.attn"
|
||||||
error_msg = f"{layer_1} cannot be the same as the current layer"
|
error_msg = f"{layer_1} cannot be the same as the current layer"
|
||||||
@@ -534,6 +537,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
|
|||||||
|
|
||||||
|
|
||||||
def test_init_kv_cache_without_kv_sharing():
|
def test_init_kv_cache_without_kv_sharing():
|
||||||
|
torch.set_default_dtype(torch.float16)
|
||||||
layer_0 = "model.layers.0.self_attn.attn"
|
layer_0 = "model.layers.0.self_attn.attn"
|
||||||
layer_1 = "model.layers.1.self_attn.attn"
|
layer_1 = "model.layers.1.self_attn.attn"
|
||||||
vllm_config = get_vllm_config()
|
vllm_config = get_vllm_config()
|
||||||
@@ -601,6 +605,7 @@ def test_init_kv_cache_without_kv_sharing():
|
|||||||
|
|
||||||
|
|
||||||
def test_init_kv_cache_with_kv_sharing_valid():
|
def test_init_kv_cache_with_kv_sharing_valid():
|
||||||
|
torch.set_default_dtype(torch.float16)
|
||||||
layer_0 = "model.layers.0.self_attn.attn"
|
layer_0 = "model.layers.0.self_attn.attn"
|
||||||
layer_1 = "model.layers.1.self_attn.attn"
|
layer_1 = "model.layers.1.self_attn.attn"
|
||||||
vllm_config = get_vllm_config()
|
vllm_config = get_vllm_config()
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|||||||
@@ -2,51 +2,146 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
SPDX_HEADER = (
|
|
||||||
|
class SPDXStatus(Enum):
|
||||||
|
"""SPDX header status enumeration"""
|
||||||
|
EMPTY = "empty" # empty __init__.py
|
||||||
|
COMPLETE = "complete"
|
||||||
|
MISSING_LICENSE = "missing_license" # Only has copyright line
|
||||||
|
MISSING_COPYRIGHT = "missing_copyright" # Only has license line
|
||||||
|
MISSING_BOTH = "missing_both" # Completely missing
|
||||||
|
|
||||||
|
|
||||||
|
FULL_SPDX_HEADER = (
|
||||||
"# SPDX-License-Identifier: Apache-2.0\n"
|
"# SPDX-License-Identifier: Apache-2.0\n"
|
||||||
"# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
|
"# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
|
||||||
SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:"
|
|
||||||
|
LICENSE_LINE = "# SPDX-License-Identifier: Apache-2.0"
|
||||||
|
COPYRIGHT_LINE = "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project" # noqa: E501
|
||||||
|
|
||||||
|
|
||||||
def check_spdx_header(file_path):
|
def check_spdx_header_status(file_path):
|
||||||
with open(file_path, encoding='UTF-8') as file:
|
"""Check SPDX header status of the file"""
|
||||||
|
with open(file_path, encoding="UTF-8") as file:
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
if not lines:
|
if not lines:
|
||||||
# Empty file like __init__.py
|
# Empty file
|
||||||
return True
|
return SPDXStatus.EMPTY
|
||||||
for line in lines:
|
|
||||||
if line.strip().startswith(SPDX_HEADER_PREFIX):
|
# Skip shebang line
|
||||||
return True
|
start_idx = 0
|
||||||
return False
|
if lines and lines[0].startswith("#!"):
|
||||||
|
start_idx = 1
|
||||||
|
|
||||||
|
has_license = False
|
||||||
|
has_copyright = False
|
||||||
|
|
||||||
|
# Check all lines for SPDX headers (not just the first two)
|
||||||
|
for i in range(start_idx, len(lines)):
|
||||||
|
line = lines[i].strip()
|
||||||
|
if line == LICENSE_LINE:
|
||||||
|
has_license = True
|
||||||
|
elif line == COPYRIGHT_LINE:
|
||||||
|
has_copyright = True
|
||||||
|
|
||||||
|
# Determine status based on what we found
|
||||||
|
if has_license and has_copyright:
|
||||||
|
return SPDXStatus.COMPLETE
|
||||||
|
elif has_license and not has_copyright:
|
||||||
|
# Only has license line
|
||||||
|
return SPDXStatus.MISSING_COPYRIGHT
|
||||||
|
# Only has copyright line
|
||||||
|
elif not has_license and has_copyright:
|
||||||
|
return SPDXStatus.MISSING_LICENSE
|
||||||
|
else:
|
||||||
|
# Completely missing both lines
|
||||||
|
return SPDXStatus.MISSING_BOTH
|
||||||
|
|
||||||
|
|
||||||
def add_header(file_path):
|
def add_header(file_path, status):
|
||||||
with open(file_path, 'r+', encoding='UTF-8') as file:
|
"""Add or supplement SPDX header based on status"""
|
||||||
|
with open(file_path, "r+", encoding="UTF-8") as file:
|
||||||
lines = file.readlines()
|
lines = file.readlines()
|
||||||
file.seek(0, 0)
|
file.seek(0, 0)
|
||||||
if lines and lines[0].startswith("#!"):
|
file.truncate()
|
||||||
file.write(lines[0])
|
|
||||||
file.write(SPDX_HEADER + '\n')
|
if status == SPDXStatus.MISSING_BOTH:
|
||||||
file.writelines(lines[1:])
|
# Completely missing, add complete header
|
||||||
else:
|
if lines and lines[0].startswith("#!"):
|
||||||
file.write(SPDX_HEADER + '\n')
|
# Preserve shebang line
|
||||||
|
file.write(lines[0])
|
||||||
|
file.write(FULL_SPDX_HEADER + "\n")
|
||||||
|
file.writelines(lines[1:])
|
||||||
|
else:
|
||||||
|
# Add header directly
|
||||||
|
file.write(FULL_SPDX_HEADER + "\n")
|
||||||
|
file.writelines(lines)
|
||||||
|
|
||||||
|
elif status == SPDXStatus.MISSING_COPYRIGHT:
|
||||||
|
# Only has license line, need to add copyright line
|
||||||
|
# Find the license line and add copyright line after it
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if line.strip() == LICENSE_LINE:
|
||||||
|
# Insert copyright line after license line
|
||||||
|
lines.insert(
|
||||||
|
i + 1,
|
||||||
|
f"{COPYRIGHT_LINE}\n",
|
||||||
|
)
|
||||||
|
break
|
||||||
|
|
||||||
|
file.writelines(lines)
|
||||||
|
|
||||||
|
elif status == SPDXStatus.MISSING_LICENSE:
|
||||||
|
# Only has copyright line, need to add license line
|
||||||
|
# Find the copyright line and add license line before it
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if line.strip() == COPYRIGHT_LINE:
|
||||||
|
# Insert license line before copyright line
|
||||||
|
lines.insert(i, f"{LICENSE_LINE}\n")
|
||||||
|
break
|
||||||
file.writelines(lines)
|
file.writelines(lines)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
files_with_missing_header = []
|
"""Main function"""
|
||||||
|
files_missing_both = []
|
||||||
|
files_missing_copyright = []
|
||||||
|
files_missing_license = []
|
||||||
|
|
||||||
for file_path in sys.argv[1:]:
|
for file_path in sys.argv[1:]:
|
||||||
if not check_spdx_header(file_path):
|
status = check_spdx_header_status(file_path)
|
||||||
files_with_missing_header.append(file_path)
|
|
||||||
|
|
||||||
if files_with_missing_header:
|
if status == SPDXStatus.MISSING_BOTH:
|
||||||
|
files_missing_both.append(file_path)
|
||||||
|
elif status == SPDXStatus.MISSING_COPYRIGHT:
|
||||||
|
files_missing_copyright.append(file_path)
|
||||||
|
elif status == SPDXStatus.MISSING_LICENSE:
|
||||||
|
files_missing_license.append(file_path)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Collect all files that need fixing
|
||||||
|
all_files_to_fix = (files_missing_both + files_missing_copyright +
|
||||||
|
files_missing_license)
|
||||||
|
if all_files_to_fix:
|
||||||
print("The following files are missing the SPDX header:")
|
print("The following files are missing the SPDX header:")
|
||||||
for file_path in files_with_missing_header:
|
if files_missing_both:
|
||||||
print(f" {file_path}")
|
for file_path in files_missing_both:
|
||||||
add_header(file_path)
|
print(f" {file_path}")
|
||||||
|
add_header(file_path, SPDXStatus.MISSING_BOTH)
|
||||||
|
|
||||||
sys.exit(1 if files_with_missing_header else 0)
|
if files_missing_copyright:
|
||||||
|
for file_path in files_missing_copyright:
|
||||||
|
print(f" {file_path}")
|
||||||
|
add_header(file_path, SPDXStatus.MISSING_COPYRIGHT)
|
||||||
|
if files_missing_license:
|
||||||
|
for file_path in files_missing_license:
|
||||||
|
print(f" {file_path}")
|
||||||
|
add_header(file_path, SPDXStatus.MISSING_LICENSE)
|
||||||
|
|
||||||
|
sys.exit(1 if all_files_to_fix else 0)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -646,6 +646,20 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
|
|||||||
return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
|
return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
|
||||||
|
|
||||||
|
|
||||||
|
def cutlass_blockwise_scaled_grouped_mm(
|
||||||
|
output: torch.Tensor,
|
||||||
|
a: torch.Tensor,
|
||||||
|
b: torch.Tensor,
|
||||||
|
scales_a: torch.Tensor,
|
||||||
|
scales_b: torch.Tensor,
|
||||||
|
problem_sizes: torch.Tensor,
|
||||||
|
expert_offsets: torch.Tensor,
|
||||||
|
):
|
||||||
|
torch.ops._C.cutlass_blockwise_scaled_grouped_mm(output, a, b, scales_a,
|
||||||
|
scales_b, problem_sizes,
|
||||||
|
expert_offsets)
|
||||||
|
|
||||||
|
|
||||||
def cutlass_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
|
def cutlass_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
|
||||||
block_scale_a: torch.Tensor,
|
block_scale_a: torch.Tensor,
|
||||||
block_scale_b: torch.Tensor, alpha: torch.Tensor,
|
block_scale_b: torch.Tensor, alpha: torch.Tensor,
|
||||||
|
|||||||
@@ -310,7 +310,8 @@ class MultiHeadAttention(nn.Module):
|
|||||||
# currently, only torch_sdpa is supported on rocm
|
# currently, only torch_sdpa is supported on rocm
|
||||||
self.attn_backend = _Backend.TORCH_SDPA
|
self.attn_backend = _Backend.TORCH_SDPA
|
||||||
else:
|
else:
|
||||||
if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
|
if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1,
|
||||||
|
_Backend.FLEX_ATTENTION):
|
||||||
backend = _Backend.XFORMERS
|
backend = _Backend.XFORMERS
|
||||||
|
|
||||||
self.attn_backend = backend if backend in {
|
self.attn_backend = backend if backend in {
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
import os
|
import os
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from typing import Generator, Optional, Type
|
from typing import Generator, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -79,6 +79,33 @@ def get_global_forced_attn_backend() -> Optional[_Backend]:
|
|||||||
return forced_attn_backend
|
return forced_attn_backend
|
||||||
|
|
||||||
|
|
||||||
|
def supports_head_size(
|
||||||
|
attn_backend: Union[str, type[AttentionBackend]],
|
||||||
|
head_size: int,
|
||||||
|
) -> bool:
|
||||||
|
if isinstance(attn_backend, str):
|
||||||
|
try:
|
||||||
|
attn_backend = resolve_obj_by_qualname(attn_backend)
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
assert isinstance(attn_backend, type)
|
||||||
|
|
||||||
|
# TODO: Update the interface once V0 is removed
|
||||||
|
if get_supported_head_sizes := getattr(attn_backend,
|
||||||
|
"get_supported_head_sizes", None):
|
||||||
|
return head_size in get_supported_head_sizes()
|
||||||
|
if validate_head_size := getattr(attn_backend, "validate_head_size", None):
|
||||||
|
try:
|
||||||
|
validate_head_size(head_size)
|
||||||
|
return True
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
raise NotImplementedError(f"{attn_backend.__name__} does not support "
|
||||||
|
"head size validation")
|
||||||
|
|
||||||
|
|
||||||
def get_attn_backend(
|
def get_attn_backend(
|
||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
@@ -87,7 +114,7 @@ def get_attn_backend(
|
|||||||
is_attention_free: bool,
|
is_attention_free: bool,
|
||||||
is_blocksparse: bool = False,
|
is_blocksparse: bool = False,
|
||||||
use_mla: bool = False,
|
use_mla: bool = False,
|
||||||
) -> Type[AttentionBackend]:
|
) -> type[AttentionBackend]:
|
||||||
"""Selects which attention backend to use and lazily imports it."""
|
"""Selects which attention backend to use and lazily imports it."""
|
||||||
# Accessing envs.* behind an @lru_cache decorator can cause the wrong
|
# Accessing envs.* behind an @lru_cache decorator can cause the wrong
|
||||||
# value to be returned from the cache if the value changes between calls.
|
# value to be returned from the cache if the value changes between calls.
|
||||||
@@ -115,7 +142,7 @@ def _cached_get_attn_backend(
|
|||||||
is_blocksparse: bool = False,
|
is_blocksparse: bool = False,
|
||||||
use_v1: bool = False,
|
use_v1: bool = False,
|
||||||
use_mla: bool = False,
|
use_mla: bool = False,
|
||||||
) -> Type[AttentionBackend]:
|
) -> type[AttentionBackend]:
|
||||||
if is_blocksparse:
|
if is_blocksparse:
|
||||||
logger.info("Using BlocksparseFlashAttention backend.")
|
logger.info("Using BlocksparseFlashAttention backend.")
|
||||||
from vllm.attention.backends.blocksparse_attn import (
|
from vllm.attention.backends.blocksparse_attn import (
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch._inductor.pattern_matcher as pm
|
import torch._inductor.pattern_matcher as pm
|
||||||
|
|||||||
@@ -466,6 +466,9 @@ class ModelConfig:
|
|||||||
"affect the random state of the Python process that "
|
"affect the random state of the Python process that "
|
||||||
"launched vLLM.", self.seed)
|
"launched vLLM.", self.seed)
|
||||||
|
|
||||||
|
# Keep set served_model_name before maybe_model_redirect(self.model)
|
||||||
|
self.served_model_name = get_served_model_name(self.model,
|
||||||
|
self.served_model_name)
|
||||||
self.model = maybe_model_redirect(self.model)
|
self.model = maybe_model_redirect(self.model)
|
||||||
# The tokenizer is consistent with the model by default.
|
# The tokenizer is consistent with the model by default.
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
@@ -609,8 +612,6 @@ class ModelConfig:
|
|||||||
|
|
||||||
self.original_max_model_len = self.max_model_len
|
self.original_max_model_len = self.max_model_len
|
||||||
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
|
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
|
||||||
self.served_model_name = get_served_model_name(self.model,
|
|
||||||
self.served_model_name)
|
|
||||||
self.multimodal_config = self._init_multimodal_config()
|
self.multimodal_config = self._init_multimodal_config()
|
||||||
if not self.skip_tokenizer_init:
|
if not self.skip_tokenizer_init:
|
||||||
self._verify_tokenizer_mode()
|
self._verify_tokenizer_mode()
|
||||||
@@ -1420,7 +1421,7 @@ class ModelConfig:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def is_cross_encoder(self) -> bool:
|
def is_cross_encoder(self) -> bool:
|
||||||
return self.registry.is_cross_encoder_model(self.architectures)
|
return self.task == "classify"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def use_mla(self) -> bool:
|
def use_mla(self) -> bool:
|
||||||
@@ -2318,7 +2319,7 @@ class SchedulerConfig:
|
|||||||
|
|
||||||
if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
|
if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"max_num_batched_tokens (%d) exceeds max_num_seqs"
|
"max_num_batched_tokens (%d) exceeds max_num_seqs "
|
||||||
"* max_model_len (%d). This may lead to unexpected behavior.",
|
"* max_model_len (%d). This may lead to unexpected behavior.",
|
||||||
self.max_num_batched_tokens,
|
self.max_num_batched_tokens,
|
||||||
self.max_num_seqs * self.max_model_len)
|
self.max_num_seqs * self.max_model_len)
|
||||||
@@ -4762,6 +4763,12 @@ class VllmConfig:
|
|||||||
if cls is not None:
|
if cls is not None:
|
||||||
cls.verify_and_update_config(self)
|
cls.verify_and_update_config(self)
|
||||||
|
|
||||||
|
if self.model_config.task == "classify":
|
||||||
|
# Maybe convert ForCausalLM into ForSequenceClassification model.
|
||||||
|
from vllm.model_executor.models.adapters import (
|
||||||
|
SequenceClassificationConfig)
|
||||||
|
SequenceClassificationConfig.verify_and_update_config(self)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return (
|
return (
|
||||||
f"model={self.model_config.model!r},"
|
f"model={self.model_config.model!r},"
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
'''
|
'''
|
||||||
Expert parallelism load balancer (EPLB).
|
Expert parallelism load balancer (EPLB).
|
||||||
'''
|
'''
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
Expert parallelism load balancer (EPLB) metrics and states.
|
Expert parallelism load balancer (EPLB) metrics and states.
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
Expert parallelism load balancer (EPLB) for vLLM.
|
Expert parallelism load balancer (EPLB) for vLLM.
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
The actual execution of the rearrangement.
|
The actual execution of the rearrangement.
|
||||||
|
|
||||||
|
|||||||
@@ -97,10 +97,10 @@ def get_kv_connector_cache_layout():
|
|||||||
# used for faster transfer.
|
# used for faster transfer.
|
||||||
vllm_config = get_current_vllm_config()
|
vllm_config = get_current_vllm_config()
|
||||||
kv_config = vllm_config.kv_transfer_config
|
kv_config = vllm_config.kv_transfer_config
|
||||||
if vllm_config.model_config is None or kv_config is None:
|
if kv_config is not None and vllm_config.model_config is None:
|
||||||
logger.warning_once("Unable to detect current VLLM config. " \
|
logger.warning_once("Unable to detect current VLLM config. " \
|
||||||
"Defaulting to NHD kv cache layout.")
|
"Defaulting to NHD kv cache layout.")
|
||||||
else:
|
elif kv_config is not None:
|
||||||
use_mla = vllm_config.model_config.use_mla
|
use_mla = vllm_config.model_config.use_mla
|
||||||
if not use_mla and kv_config.kv_connector == "NixlConnector":
|
if not use_mla and kv_config.kv_connector == "NixlConnector":
|
||||||
logger.info_once("NixlConnector detected. Setting KV cache " \
|
logger.info_once("NixlConnector detected. Setting KV cache " \
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import TYPE_CHECKING, Any, Optional
|
from typing import TYPE_CHECKING, Any, Optional
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import atexit
|
import atexit
|
||||||
import ctypes
|
import ctypes
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|||||||
@@ -1393,13 +1393,6 @@ class EngineArgs:
|
|||||||
recommend_to_remove=False)
|
recommend_to_remove=False)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Only Fp16 and Bf16 dtypes since we only support FA.
|
|
||||||
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
|
|
||||||
if model_config.dtype not in V1_SUPPORTED_DTYPES:
|
|
||||||
_raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
|
|
||||||
recommend_to_remove=False)
|
|
||||||
return False
|
|
||||||
|
|
||||||
# No Mamba or Encoder-Decoder so far.
|
# No Mamba or Encoder-Decoder so far.
|
||||||
if not model_config.is_v1_compatible:
|
if not model_config.is_v1_compatible:
|
||||||
_raise_or_fallback(feature_name=model_config.architectures,
|
_raise_or_fallback(feature_name=model_config.architectures,
|
||||||
|
|||||||
@@ -28,7 +28,8 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
|
|||||||
ChatCompletionToolMessageParam)
|
ChatCompletionToolMessageParam)
|
||||||
from openai.types.chat.chat_completion_content_part_input_audio_param import (
|
from openai.types.chat.chat_completion_content_part_input_audio_param import (
|
||||||
InputAudio)
|
InputAudio)
|
||||||
from pydantic import TypeAdapter
|
from PIL import Image
|
||||||
|
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
|
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
|
||||||
ProcessorMixin)
|
ProcessorMixin)
|
||||||
@@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
|
|||||||
"""The type of the content part."""
|
"""The type of the content part."""
|
||||||
|
|
||||||
|
|
||||||
|
class PILImage(BaseModel):
|
||||||
|
"""
|
||||||
|
A PIL.Image.Image object.
|
||||||
|
"""
|
||||||
|
image_pil: Image.Image
|
||||||
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
|
|
||||||
|
class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
|
||||||
|
"""A simpler version of the param that only accepts a PIL image.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
{
|
||||||
|
"image_pil": ImageAsset('cherry_blossom').pil_image
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
image_pil: Required[PILImage]
|
||||||
|
|
||||||
|
|
||||||
class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
|
class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
|
||||||
"""A simpler version of the param that only accepts a plain image_url.
|
"""A simpler version of the param that only accepts a plain image_url.
|
||||||
This is supported by OpenAI API, although it is not documented.
|
This is supported by OpenAI API, although it is not documented.
|
||||||
@@ -129,6 +149,7 @@ ChatCompletionContentPartParam: TypeAlias = Union[
|
|||||||
OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
|
OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
|
||||||
ChatCompletionContentPartInputAudioParam,
|
ChatCompletionContentPartInputAudioParam,
|
||||||
ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
|
ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
|
||||||
|
CustomChatCompletionContentPILImageParam,
|
||||||
CustomChatCompletionContentSimpleImageParam,
|
CustomChatCompletionContentSimpleImageParam,
|
||||||
ChatCompletionContentPartImageEmbedsParam,
|
ChatCompletionContentPartImageEmbedsParam,
|
||||||
CustomChatCompletionContentSimpleAudioParam,
|
CustomChatCompletionContentSimpleAudioParam,
|
||||||
@@ -631,6 +652,10 @@ class BaseMultiModalContentParser(ABC):
|
|||||||
image_embeds: Union[str, dict[str, str]]) -> None:
|
image_embeds: Union[str, dict[str, str]]) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def parse_image_pil(self, image_pil: Image.Image) -> None:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def parse_audio(self, audio_url: str) -> None:
|
def parse_audio(self, audio_url: str) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
@@ -677,6 +702,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
|
|
||||||
self._add_placeholder(placeholder)
|
self._add_placeholder(placeholder)
|
||||||
|
|
||||||
|
def parse_image_pil(self, image_pil: Image.Image) -> None:
|
||||||
|
placeholder = self._tracker.add("image", image_pil)
|
||||||
|
self._add_placeholder(placeholder)
|
||||||
|
|
||||||
def parse_audio(self, audio_url: str) -> None:
|
def parse_audio(self, audio_url: str) -> None:
|
||||||
audio = self._connector.fetch_audio(audio_url)
|
audio = self._connector.fetch_audio(audio_url)
|
||||||
|
|
||||||
@@ -733,6 +762,13 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
placeholder = self._tracker.add("image_embeds", future)
|
placeholder = self._tracker.add("image_embeds", future)
|
||||||
self._add_placeholder(placeholder)
|
self._add_placeholder(placeholder)
|
||||||
|
|
||||||
|
def parse_image_pil(self, image_pil: Image.Image) -> None:
|
||||||
|
future: asyncio.Future[Image.Image] = asyncio.Future()
|
||||||
|
future.set_result(image_pil)
|
||||||
|
|
||||||
|
placeholder = self._tracker.add("image", future)
|
||||||
|
self._add_placeholder(placeholder)
|
||||||
|
|
||||||
def parse_audio(self, audio_url: str) -> None:
|
def parse_audio(self, audio_url: str) -> None:
|
||||||
audio_coro = self._connector.fetch_audio_async(audio_url)
|
audio_coro = self._connector.fetch_audio_async(audio_url)
|
||||||
|
|
||||||
@@ -851,12 +887,13 @@ _TextParser = partial(cast, ChatCompletionContentPartTextParam)
|
|||||||
_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
|
_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
|
||||||
_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
|
_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
|
||||||
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
|
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
|
||||||
|
_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
|
||||||
# Need to validate url objects
|
# Need to validate url objects
|
||||||
_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
|
_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
|
||||||
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
|
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
|
||||||
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
|
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
|
||||||
|
|
||||||
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
|
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
|
||||||
|
|
||||||
# Define a mapping from part types to their corresponding parsing functions.
|
# Define a mapping from part types to their corresponding parsing functions.
|
||||||
MM_PARSER_MAP: dict[
|
MM_PARSER_MAP: dict[
|
||||||
@@ -869,6 +906,7 @@ MM_PARSER_MAP: dict[
|
|||||||
lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
|
lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
|
||||||
"image_embeds":
|
"image_embeds":
|
||||||
lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
|
lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
|
||||||
|
"image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
|
||||||
"audio_url":
|
"audio_url":
|
||||||
lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
|
lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
|
||||||
"input_audio":
|
"input_audio":
|
||||||
@@ -938,7 +976,7 @@ def _parse_chat_message_content_mm_part(
|
|||||||
|
|
||||||
|
|
||||||
VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
|
VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
|
||||||
"image_embeds",
|
"image_embeds", "image_pil",
|
||||||
"audio_url", "input_audio", "video_url")
|
"audio_url", "input_audio", "video_url")
|
||||||
|
|
||||||
|
|
||||||
@@ -1009,6 +1047,10 @@ def _parse_chat_message_content_part(
|
|||||||
else:
|
else:
|
||||||
return str_content
|
return str_content
|
||||||
|
|
||||||
|
if part_type == "image_pil":
|
||||||
|
image_content = cast(Image.Image, content)
|
||||||
|
mm_parser.parse_image_pil(image_content)
|
||||||
|
return {'type': 'image'} if wrap_dicts else None
|
||||||
if part_type == "image_url":
|
if part_type == "image_url":
|
||||||
str_content = cast(str, content)
|
str_content = cast(str, content)
|
||||||
mm_parser.parse_image(str_content)
|
mm_parser.parse_image(str_content)
|
||||||
|
|||||||
@@ -1204,7 +1204,7 @@ class LLM:
|
|||||||
|
|
||||||
input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
|
input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
|
||||||
|
|
||||||
pooling_params = PoolingParams()
|
pooling_params = PoolingParams(use_cross_encoder=True)
|
||||||
|
|
||||||
tokenization_kwargs: dict[str, Any] = {}
|
tokenization_kwargs: dict[str, Any] = {}
|
||||||
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
|
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
|
||||||
|
|||||||
@@ -910,6 +910,8 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if envs.VLLM_SERVER_DEV_MODE:
|
if envs.VLLM_SERVER_DEV_MODE:
|
||||||
|
logger.warning("SECURITY WARNING: Development endpoints are enabled! "
|
||||||
|
"This should NOT be used in production!")
|
||||||
|
|
||||||
@router.get("/server_info")
|
@router.get("/server_info")
|
||||||
async def show_server_info(raw_request: Request):
|
async def show_server_info(raw_request: Request):
|
||||||
|
|||||||
@@ -229,7 +229,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
|||||||
logit_bias: Optional[dict[str, float]] = None
|
logit_bias: Optional[dict[str, float]] = None
|
||||||
logprobs: Optional[bool] = False
|
logprobs: Optional[bool] = False
|
||||||
top_logprobs: Optional[int] = 0
|
top_logprobs: Optional[int] = 0
|
||||||
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
|
|
||||||
max_tokens: Optional[int] = Field(
|
max_tokens: Optional[int] = Field(
|
||||||
default=None,
|
default=None,
|
||||||
deprecated=
|
deprecated=
|
||||||
@@ -433,23 +432,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def to_beam_search_params(
|
def to_beam_search_params(
|
||||||
self,
|
self, max_tokens: int,
|
||||||
default_max_tokens: int,
|
default_sampling_params: dict) -> BeamSearchParams:
|
||||||
default_sampling_params: Optional[dict] = None
|
|
||||||
) -> BeamSearchParams:
|
|
||||||
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
|
|
||||||
max_tokens = self.max_completion_tokens or self.max_tokens
|
|
||||||
|
|
||||||
if default_sampling_params is None:
|
|
||||||
default_sampling_params = {}
|
|
||||||
n = self.n if self.n is not None else 1
|
n = self.n if self.n is not None else 1
|
||||||
|
|
||||||
# Use minimum of context window, user request & server limit.
|
|
||||||
max_tokens = min(
|
|
||||||
val for val in (default_max_tokens, max_tokens,
|
|
||||||
default_sampling_params.get("max_tokens", None))
|
|
||||||
if val is not None)
|
|
||||||
|
|
||||||
if (temperature := self.temperature) is None:
|
if (temperature := self.temperature) is None:
|
||||||
temperature = default_sampling_params.get(
|
temperature = default_sampling_params.get(
|
||||||
"temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
|
"temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
|
||||||
@@ -465,21 +451,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
|||||||
|
|
||||||
def to_sampling_params(
|
def to_sampling_params(
|
||||||
self,
|
self,
|
||||||
default_max_tokens: int,
|
max_tokens: int,
|
||||||
logits_processor_pattern: Optional[str],
|
logits_processor_pattern: Optional[str],
|
||||||
default_sampling_params: Optional[dict] = None,
|
default_sampling_params: dict,
|
||||||
) -> SamplingParams:
|
) -> SamplingParams:
|
||||||
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
|
|
||||||
max_tokens = self.max_completion_tokens or self.max_tokens
|
|
||||||
|
|
||||||
if default_sampling_params is None:
|
|
||||||
default_sampling_params = {}
|
|
||||||
|
|
||||||
# Use minimum of context window, user request & server limit.
|
|
||||||
max_tokens = min(
|
|
||||||
val for val in (default_max_tokens, max_tokens,
|
|
||||||
default_sampling_params.get("max_tokens", None))
|
|
||||||
if val is not None)
|
|
||||||
|
|
||||||
# Default parameters
|
# Default parameters
|
||||||
if (repetition_penalty := self.repetition_penalty) is None:
|
if (repetition_penalty := self.repetition_penalty) is None:
|
||||||
@@ -898,22 +873,15 @@ class CompletionRequest(OpenAIBaseModel):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def to_beam_search_params(
|
def to_beam_search_params(
|
||||||
self,
|
self,
|
||||||
default_max_tokens: int,
|
max_tokens: int,
|
||||||
default_sampling_params: Optional[dict] = None
|
default_sampling_params: Optional[dict] = None,
|
||||||
) -> BeamSearchParams:
|
) -> BeamSearchParams:
|
||||||
max_tokens = self.max_tokens
|
|
||||||
|
|
||||||
if default_sampling_params is None:
|
if default_sampling_params is None:
|
||||||
default_sampling_params = {}
|
default_sampling_params = {}
|
||||||
n = self.n if self.n is not None else 1
|
n = self.n if self.n is not None else 1
|
||||||
|
|
||||||
# Use minimum of context window, user request & server limit.
|
|
||||||
max_tokens = min(
|
|
||||||
val for val in (default_max_tokens, max_tokens,
|
|
||||||
default_sampling_params.get("max_tokens", None))
|
|
||||||
if val is not None)
|
|
||||||
|
|
||||||
if (temperature := self.temperature) is None:
|
if (temperature := self.temperature) is None:
|
||||||
temperature = default_sampling_params.get("temperature", 1.0)
|
temperature = default_sampling_params.get("temperature", 1.0)
|
||||||
|
|
||||||
@@ -928,21 +896,14 @@ class CompletionRequest(OpenAIBaseModel):
|
|||||||
|
|
||||||
def to_sampling_params(
|
def to_sampling_params(
|
||||||
self,
|
self,
|
||||||
default_max_tokens: int,
|
max_tokens: int,
|
||||||
logits_processor_pattern: Optional[str],
|
logits_processor_pattern: Optional[str],
|
||||||
default_sampling_params: Optional[dict] = None,
|
default_sampling_params: Optional[dict] = None,
|
||||||
) -> SamplingParams:
|
) -> SamplingParams:
|
||||||
max_tokens = self.max_tokens
|
|
||||||
|
|
||||||
if default_sampling_params is None:
|
if default_sampling_params is None:
|
||||||
default_sampling_params = {}
|
default_sampling_params = {}
|
||||||
|
|
||||||
# Use minimum of context window, user request & server limit.
|
|
||||||
max_tokens = min(
|
|
||||||
val for val in (default_max_tokens, max_tokens,
|
|
||||||
default_sampling_params.get("max_tokens", None))
|
|
||||||
if val is not None)
|
|
||||||
|
|
||||||
# Default parameters
|
# Default parameters
|
||||||
if (repetition_penalty := self.repetition_penalty) is None:
|
if (repetition_penalty := self.repetition_penalty) is None:
|
||||||
repetition_penalty = default_sampling_params.get(
|
repetition_penalty = default_sampling_params.get(
|
||||||
@@ -1195,8 +1156,9 @@ class ScoreRequest(OpenAIBaseModel):
|
|||||||
|
|
||||||
# --8<-- [end:score-extra-params]
|
# --8<-- [end:score-extra-params]
|
||||||
|
|
||||||
def to_pooling_params(self):
|
def to_pooling_params(self, *, use_cross_encoder: bool = False):
|
||||||
return PoolingParams(additional_data=self.additional_data)
|
return PoolingParams(use_cross_encoder=use_cross_encoder,
|
||||||
|
additional_data=self.additional_data)
|
||||||
|
|
||||||
|
|
||||||
class RerankRequest(OpenAIBaseModel):
|
class RerankRequest(OpenAIBaseModel):
|
||||||
@@ -1221,8 +1183,9 @@ class RerankRequest(OpenAIBaseModel):
|
|||||||
|
|
||||||
# --8<-- [end:rerank-extra-params]
|
# --8<-- [end:rerank-extra-params]
|
||||||
|
|
||||||
def to_pooling_params(self):
|
def to_pooling_params(self, *, use_cross_encoder: bool = False):
|
||||||
return PoolingParams(additional_data=self.additional_data)
|
return PoolingParams(use_cross_encoder=use_cross_encoder,
|
||||||
|
additional_data=self.additional_data)
|
||||||
|
|
||||||
|
|
||||||
class RerankDocument(BaseModel):
|
class RerankDocument(BaseModel):
|
||||||
@@ -1813,7 +1776,7 @@ class TranscriptionRequest(OpenAIBaseModel):
|
|||||||
self,
|
self,
|
||||||
default_max_tokens: int,
|
default_max_tokens: int,
|
||||||
default_sampling_params: Optional[dict] = None) -> SamplingParams:
|
default_sampling_params: Optional[dict] = None) -> SamplingParams:
|
||||||
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
|
|
||||||
max_tokens = default_max_tokens
|
max_tokens = default_max_tokens
|
||||||
|
|
||||||
if default_sampling_params is None:
|
if default_sampling_params is None:
|
||||||
@@ -2029,7 +1992,7 @@ class TranslationRequest(OpenAIBaseModel):
|
|||||||
self,
|
self,
|
||||||
default_max_tokens: int,
|
default_max_tokens: int,
|
||||||
default_sampling_params: Optional[dict] = None) -> SamplingParams:
|
default_sampling_params: Optional[dict] = None) -> SamplingParams:
|
||||||
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
|
|
||||||
max_tokens = default_max_tokens
|
max_tokens = default_max_tokens
|
||||||
|
|
||||||
if default_sampling_params is None:
|
if default_sampling_params is None:
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
|||||||
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
|
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
|
||||||
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
|
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
|
||||||
MistralToolCall)
|
MistralToolCall)
|
||||||
|
from vllm.entrypoints.utils import get_max_tokens
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.outputs import CompletionOutput, RequestOutput
|
from vllm.outputs import CompletionOutput, RequestOutput
|
||||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||||
@@ -233,15 +234,22 @@ class OpenAIServingChat(OpenAIServing):
|
|||||||
try:
|
try:
|
||||||
for i, engine_prompt in enumerate(engine_prompts):
|
for i, engine_prompt in enumerate(engine_prompts):
|
||||||
sampling_params: Union[SamplingParams, BeamSearchParams]
|
sampling_params: Union[SamplingParams, BeamSearchParams]
|
||||||
default_max_tokens = self.max_model_len - len(
|
|
||||||
engine_prompt["prompt_token_ids"])
|
if self.default_sampling_params is None:
|
||||||
|
self.default_sampling_params = {}
|
||||||
|
|
||||||
|
max_tokens = get_max_tokens(
|
||||||
|
max_model_len=self.max_model_len,
|
||||||
|
request=request,
|
||||||
|
input_length=len(engine_prompt["prompt_token_ids"]),
|
||||||
|
default_sampling_params=self.default_sampling_params)
|
||||||
|
|
||||||
if request.use_beam_search:
|
if request.use_beam_search:
|
||||||
sampling_params = request.to_beam_search_params(
|
sampling_params = request.to_beam_search_params(
|
||||||
default_max_tokens, self.default_sampling_params)
|
max_tokens, self.default_sampling_params)
|
||||||
else:
|
else:
|
||||||
sampling_params = request.to_sampling_params(
|
sampling_params = request.to_sampling_params(
|
||||||
default_max_tokens,
|
max_tokens, self.model_config.logits_processor_pattern,
|
||||||
self.model_config.logits_processor_pattern,
|
|
||||||
self.default_sampling_params)
|
self.default_sampling_params)
|
||||||
|
|
||||||
self._log_inputs(request_id,
|
self._log_inputs(request_id,
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
|
|||||||
is_text_tokens_prompt)
|
is_text_tokens_prompt)
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
|
from vllm.entrypoints.utils import get_max_tokens
|
||||||
from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
|
from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
|
||||||
is_tokens_prompt)
|
is_tokens_prompt)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@@ -160,15 +161,22 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
input_length = len(engine_prompt["prompt_token_ids"])
|
input_length = len(engine_prompt["prompt_token_ids"])
|
||||||
else:
|
else:
|
||||||
assert_never(engine_prompt)
|
assert_never(engine_prompt)
|
||||||
default_max_tokens = self.max_model_len - input_length
|
|
||||||
|
if self.default_sampling_params is None:
|
||||||
|
self.default_sampling_params = {}
|
||||||
|
|
||||||
|
max_tokens = get_max_tokens(
|
||||||
|
max_model_len=self.max_model_len,
|
||||||
|
request=request,
|
||||||
|
input_length=input_length,
|
||||||
|
default_sampling_params=self.default_sampling_params)
|
||||||
|
|
||||||
if request.use_beam_search:
|
if request.use_beam_search:
|
||||||
sampling_params = request.to_beam_search_params(
|
sampling_params = request.to_beam_search_params(
|
||||||
default_max_tokens, self.default_sampling_params)
|
max_tokens, self.default_sampling_params)
|
||||||
else:
|
else:
|
||||||
sampling_params = request.to_sampling_params(
|
sampling_params = request.to_sampling_params(
|
||||||
default_max_tokens,
|
max_tokens, self.model_config.logits_processor_pattern,
|
||||||
self.model_config.logits_processor_pattern,
|
|
||||||
self.default_sampling_params)
|
self.default_sampling_params)
|
||||||
|
|
||||||
request_id_item = f"{request_id}-{i}"
|
request_id_item = f"{request_id}-{i}"
|
||||||
|
|||||||
@@ -25,9 +25,7 @@ from vllm.logger import init_logger
|
|||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
|
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
|
||||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||||
PreTrainedTokenizer,
|
|
||||||
PreTrainedTokenizerFast)
|
|
||||||
from vllm.utils import make_async, merge_async_iterators
|
from vllm.utils import make_async, merge_async_iterators
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@@ -50,7 +48,7 @@ class ServingScores(OpenAIServing):
|
|||||||
|
|
||||||
async def _embedding_score(
|
async def _embedding_score(
|
||||||
self,
|
self,
|
||||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
tokenizer: AnyTokenizer,
|
||||||
texts_1: list[str],
|
texts_1: list[str],
|
||||||
texts_2: list[str],
|
texts_2: list[str],
|
||||||
request: Union[RerankRequest, ScoreRequest],
|
request: Union[RerankRequest, ScoreRequest],
|
||||||
@@ -141,7 +139,7 @@ class ServingScores(OpenAIServing):
|
|||||||
|
|
||||||
async def _cross_encoding_score(
|
async def _cross_encoding_score(
|
||||||
self,
|
self,
|
||||||
tokenizer: Union[AnyTokenizer],
|
tokenizer: AnyTokenizer,
|
||||||
texts_1: list[str],
|
texts_1: list[str],
|
||||||
texts_2: list[str],
|
texts_2: list[str],
|
||||||
request: Union[RerankRequest, ScoreRequest],
|
request: Union[RerankRequest, ScoreRequest],
|
||||||
@@ -190,7 +188,7 @@ class ServingScores(OpenAIServing):
|
|||||||
# Schedule the request and get the result generator.
|
# Schedule the request and get the result generator.
|
||||||
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
|
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
|
||||||
|
|
||||||
pooling_params = request.to_pooling_params()
|
pooling_params = request.to_pooling_params(use_cross_encoder=True)
|
||||||
|
|
||||||
for i, engine_prompt in enumerate(engine_prompts):
|
for i, engine_prompt in enumerate(engine_prompts):
|
||||||
request_id_item = f"{request_id}-{i}"
|
request_id_item = f"{request_id}-{i}"
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from typing import Union
|
|||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
from vllm.entrypoints.chat_utils import random_tool_call_id
|
||||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||||
DeltaFunctionCall, DeltaMessage,
|
DeltaFunctionCall, DeltaMessage,
|
||||||
DeltaToolCall,
|
DeltaToolCall,
|
||||||
@@ -15,7 +16,6 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
|
|||||||
ToolParser, ToolParserManager)
|
ToolParser, ToolParserManager)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.utils import random_uuid
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser):
|
|||||||
DeltaToolCall(
|
DeltaToolCall(
|
||||||
index=self.current_tool_id,
|
index=self.current_tool_id,
|
||||||
type="function",
|
type="function",
|
||||||
id=f"chatcmpl-tool-{random_uuid()}",
|
id=random_tool_call_id(),
|
||||||
function=DeltaFunctionCall(
|
function=DeltaFunctionCall(
|
||||||
name=function_name).model_dump(
|
name=function_name).model_dump(
|
||||||
exclude_none=True),
|
exclude_none=True),
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# ruff: noqa
|
# ruff: noqa
|
||||||
import json
|
import json
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
from vllm.entrypoints.chat_utils import random_tool_call_id
|
||||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||||
DeltaFunctionCall, DeltaMessage,
|
DeltaFunctionCall, DeltaMessage,
|
||||||
DeltaToolCall,
|
DeltaToolCall,
|
||||||
@@ -224,7 +226,7 @@ class xLAMToolParser(ToolParser):
|
|||||||
function_name = name_match.group(1)
|
function_name = name_match.group(1)
|
||||||
|
|
||||||
# The test expects us to send just the name first
|
# The test expects us to send just the name first
|
||||||
tool_id = f"chatcmpl-tool-{random_uuid()}"
|
tool_id = random_tool_call_id()
|
||||||
delta = DeltaMessage(tool_calls=[
|
delta = DeltaMessage(tool_calls=[
|
||||||
DeltaToolCall(
|
DeltaToolCall(
|
||||||
index=0,
|
index=0,
|
||||||
|
|||||||
@@ -5,13 +5,17 @@ import argparse
|
|||||||
import asyncio
|
import asyncio
|
||||||
import functools
|
import functools
|
||||||
import os
|
import os
|
||||||
from typing import Any, Optional
|
import sys
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
from fastapi import Request
|
from fastapi import Request
|
||||||
from fastapi.responses import JSONResponse, StreamingResponse
|
from fastapi.responses import JSONResponse, StreamingResponse
|
||||||
from starlette.background import BackgroundTask, BackgroundTasks
|
from starlette.background import BackgroundTask, BackgroundTasks
|
||||||
|
|
||||||
|
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||||
|
CompletionRequest)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@@ -181,7 +185,6 @@ def _validate_truncation_size(
|
|||||||
|
|
||||||
def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser,
|
def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser,
|
||||||
subcommand_name: list[str]):
|
subcommand_name: list[str]):
|
||||||
import sys
|
|
||||||
|
|
||||||
# Only handle --help=<keyword> for the current subcommand.
|
# Only handle --help=<keyword> for the current subcommand.
|
||||||
# Since subparser_init() runs for all subcommands during CLI setup,
|
# Since subparser_init() runs for all subcommands during CLI setup,
|
||||||
@@ -242,3 +245,18 @@ def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser,
|
|||||||
print(f"\nNo group or parameter matching '{search_keyword}'")
|
print(f"\nNo group or parameter matching '{search_keyword}'")
|
||||||
print("Tip: use `--help=listgroup` to view all groups.")
|
print("Tip: use `--help=listgroup` to view all groups.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_max_tokens(max_model_len: int, request: Union[ChatCompletionRequest,
|
||||||
|
CompletionRequest],
|
||||||
|
input_length: int, default_sampling_params: dict) -> int:
|
||||||
|
|
||||||
|
max_tokens = getattr(request, "max_completion_tokens",
|
||||||
|
None) or request.max_tokens
|
||||||
|
default_max_tokens = max_model_len - input_length
|
||||||
|
max_output_tokens = current_platform.get_max_output_tokens(input_length)
|
||||||
|
|
||||||
|
return min(val
|
||||||
|
for val in (default_max_tokens, max_tokens, max_output_tokens,
|
||||||
|
default_sampling_params.get("max_tokens"))
|
||||||
|
if val is not None)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@@ -184,15 +185,14 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
|||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
max_num_tokens: int,
|
max_num_tokens: int,
|
||||||
world_size: int,
|
num_dispatchers: int,
|
||||||
dp_size: int,
|
|
||||||
block_shape: list[int],
|
block_shape: list[int],
|
||||||
per_act_token_quant=False):
|
per_act_token_quant=False):
|
||||||
"""
|
"""
|
||||||
max_num_tokens: Maximum number of tokens from a DP Rank
|
max_num_tokens: Maximum number of tokens from a DP Rank
|
||||||
world_size: Number of EP ranks
|
num_dispatchers: The number of DP dispatchers.
|
||||||
dp_size: Number of data-parallel ranks
|
block_shape: Block quantization block shape.
|
||||||
block_shape: Block quantization block shape
|
per_act_token_quant: Per activation token quantization flag.
|
||||||
"""
|
"""
|
||||||
super().__init__(
|
super().__init__(
|
||||||
FusedMoEQuantConfig(
|
FusedMoEQuantConfig(
|
||||||
@@ -202,8 +202,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
|||||||
))
|
))
|
||||||
assert self.block_shape == self.DEEPGEMM_BLOCK_SHAPE
|
assert self.block_shape == self.DEEPGEMM_BLOCK_SHAPE
|
||||||
self.max_num_tokens = max_num_tokens
|
self.max_num_tokens = max_num_tokens
|
||||||
self.world_size = world_size
|
self.num_dispatchers = num_dispatchers
|
||||||
self.dp_size = dp_size
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def activation_formats(
|
def activation_formats(
|
||||||
@@ -233,7 +232,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
|||||||
# FIXME (varun): We should be able to dispatch only from the leader
|
# FIXME (varun): We should be able to dispatch only from the leader
|
||||||
# DP ranks in the case of TP > 1. At the moment, all the Ranks
|
# DP ranks in the case of TP > 1. At the moment, all the Ranks
|
||||||
# end up sending their tokens. This needs to be fixed.
|
# end up sending their tokens. This needs to be fixed.
|
||||||
num_dispatchers = self.world_size
|
num_dispatchers = self.num_dispatchers
|
||||||
num_experts = local_num_experts
|
num_experts = local_num_experts
|
||||||
max_num_tokens = a.size(
|
max_num_tokens = a.size(
|
||||||
0) if self.max_num_tokens is None else self.max_num_tokens
|
0) if self.max_num_tokens is None else self.max_num_tokens
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user